traject 3.1.0 → 3.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7bc0afc820efb8a2479d96913ae67552556978d264e20a55d0a4d4cef5ff9a2a
4
- data.tar.gz: 773529d885d46d2ba0afd323d680724678225bd3616a720dba92809479395dce
3
+ metadata.gz: 9a69bdb470c759d08d117f910ec1c6b16cbe93dbb3e955b653a07e3c03efec68
4
+ data.tar.gz: 68016ddf17fe29348248e5f26ff5a813c1485c37a0ac014faa69803be182f7b4
5
5
  SHA512:
6
- metadata.gz: 395f1a3cf62cfd0cccbdcd428830feb098586c9ca1edc7adecff31483545a62dbc06631e7c166ea0ce1b229afeddcec7d994245d78871f1a6644c591d6cc432c
7
- data.tar.gz: 647f9cfbf0a7a0876cef29b5a90a9fd6d3dd1f6a1010b332d58e132875b5c4a7f8be9bdab1afabb5ec0bc583af9ea0d8a422cc87eec6c04d93c9fe9f4b4d1e9a
6
+ metadata.gz: 8921a5d2349025291f28c7d39ebec44c0c0ebdc11a489e1d4c22913c04c840cf40163c576b14432794f1d663b9b81eb1d777b87d38af2b429f4b6ac98982ad48
7
+ data.tar.gz: 8bdb95f98baa11ee60c74a7a4bbab06c44fb5da466138d098b05e5ab3fddc28a1fe8926044be5350e5e0b8a91ccd2d6e2e2ecf78424cfc0430464dd0ccf79c66
data/CHANGES.md CHANGED
@@ -1,5 +1,24 @@
1
1
  # Changes
2
2
 
3
+ ## Next
4
+
5
+ *
6
+
7
+ *
8
+
9
+ *
10
+
11
+ ## 3.2.0
12
+
13
+ * NokogiriReader has a "nokogiri.strict_mode" setting. Set to true or string 'true' to ask Nokogiri to parse in strict mode, so it will immediately raise on ill-formed XML, instead of Nokogiri's default of doing what it can with it. https://github.com/traject/traject/pull/226
14
+
15
+ * SolrJsonWriter
16
+
17
+ * Utility method `delete_all!` sends a delete all query to the Solr URL endpoint. https://github.com/traject/traject/pull/227
18
+
19
+ * Allow basic auth configuration of the default http client via `solr_writer.basic_auth_user` and `solr_writer.basic_auth_password`. https://github.com/traject/traject/pull/231
20
+
21
+
3
22
  ## 3.1.0
4
23
 
5
24
  ### Added
data/doc/settings.md CHANGED
@@ -93,6 +93,8 @@ settings are applied first of all. It's recommended you use `provide`.
93
93
 
94
94
  * `solr_writer.thread_pool`: defaults to 1 (single bg thread). A thread pool is used for submitting docs to solr. Set to 0 or nil to disable threading. If set to 1, there will still be a single bg thread doing the adds. May make sense to set higher than number of cores on your indexing machine, as these threads will mostly be waiting on Solr. Speed/capacity of your solr might be more relevant. Note that processing_thread_pool threads can end up submitting to solr too, if the `solr_writer.thread_pool` is full.
95
95
 
96
+ * `solr_writer.basic_auth_user`, `solr_writer.basic_auth_password`: Not set by default; when both are set, the default HTTP client is configured with basic auth.
97
+
96
98
 
97
99
  ### Dealing with MARC data
98
100
 
@@ -21,6 +21,9 @@ module Traject
21
21
  # If you need to use namespaces here, you need to have them registered with
22
22
  # `nokogiri.default_namespaces`. If your source docs use namespaces, you DO need
23
23
  # to use them in your each_record_xpath.
24
+ # * nokogiri.strict_mode: if set to `true` or `"true"`, ask Nokogiri to parse in 'strict'
25
+ # mode, so that it will raise a `Nokogiri::XML::SyntaxError` if the XML is not well-formed, instead
26
+ # of trying to take its best-guess correction. https://nokogiri.org/tutorials/ensuring_well_formed_markup.html
24
27
  # * nokogiri_reader.extra_xpath_hooks: Experimental in progress, see below.
25
28
  #
26
29
  # ## nokogiri_reader.extra_xpath_hooks: For handling nodes outside of your each_record_xpath
@@ -87,7 +90,11 @@ module Traject
87
90
  end
88
91
 
89
92
  def each
90
- whole_input_doc = Nokogiri::XML.parse(input_stream)
93
+ config_proc = if settings["nokogiri.strict_mode"]
94
+ proc { |config| config.strict }
95
+ end
96
+
97
+ whole_input_doc = Nokogiri::XML.parse(input_stream, &config_proc)
91
98
 
92
99
  if each_record_xpath
93
100
  whole_input_doc.xpath(each_record_xpath, default_namespaces).each do |matching_node|
@@ -114,6 +114,12 @@ class Traject::SolrJsonWriter
114
114
  if @settings["solr_writer.http_timeout"]
115
115
  client.connect_timeout = client.receive_timeout = client.send_timeout = @settings["solr_writer.http_timeout"]
116
116
  end
117
+
118
+ if @settings["solr_writer.basic_auth_user"] &&
119
+ @settings["solr_writer.basic_auth_password"]
120
+ client.set_auth(@settings["solr.url"], @settings["solr_writer.basic_auth_user"], @settings["solr_writer.basic_auth_password"])
121
+ end
122
+
117
123
  client
118
124
  end
119
125
 
@@ -270,6 +276,13 @@ class Traject::SolrJsonWriter
270
276
  end
271
277
  end
272
278
 
279
+ # Send a delete all query.
280
+ #
281
+ # This method takes no params and will not automatically commit the deletes.
282
+ # @example @writer.delete_all!
283
+ def delete_all!
284
+ delete(query: "*:*")
285
+ end
273
286
 
274
287
  # Get the logger from the settings, or default to an effectively null logger
275
288
  def logger
@@ -1,3 +1,3 @@
1
1
  module Traject
2
- VERSION = "3.1.0"
2
+ VERSION = "3.2.0"
3
3
  end
@@ -134,6 +134,16 @@ describe "Traject::NokogiriReader" do
134
134
  end
135
135
  end
136
136
 
137
+ describe "strict_mode" do
138
+ it "raises on non-well-formed" do
139
+ # invalid because there are two sibling root nodes; XML requires exactly one root node
140
+ reader = Traject::NokogiriReader.new(StringIO.new("<doc></doc><doc></doc>"), {"nokogiri.strict_mode" => "true" })
141
+ assert_raises(Nokogiri::XML::SyntaxError) {
142
+ reader.each { |r| }
143
+ }
144
+ end
145
+ end
146
+
137
147
 
138
148
  def shared_tests
139
149
  @reader = Traject::NokogiriReader.new(File.open(@xml_sample_path), {
@@ -170,6 +170,27 @@ describe "Traject::SolrJsonWriter" do
170
170
  assert_length 1, @fake_http_client.post_args, "Has flushed to solr"
171
171
  end
172
172
 
173
+ it "defaults to not setting basic authentication" do
174
+ settings = { "solr.url" => "http://example.com/solr/foo" }
175
+ writer = Traject::SolrJsonWriter.new(settings)
176
+ auth = writer.instance_variable_get("@http_client")
177
+ .www_auth.basic_auth.instance_variable_get("@auth")
178
+ assert(auth.empty?)
179
+ end
180
+
181
+ it "allows basic authentication setup" do
182
+ settings = {
183
+ "solr.url" => "http://example.com/solr/foo",
184
+ "solr_writer.basic_auth_user" => "foo",
185
+ "solr_writer.basic_auth_password" => "bar",
186
+ }
187
+
188
+ writer = Traject::SolrJsonWriter.new(settings)
189
+ auth = writer.instance_variable_get("@http_client")
190
+ .www_auth.basic_auth.instance_variable_get("@auth")
191
+ assert(!auth.empty?)
192
+ end
193
+
173
194
  describe "commit" do
174
195
  it "commits on close when set" do
175
196
  @writer = create_writer("solr.url" => "http://example.com", "solr_writer.commit_on_close" => "true")
@@ -365,4 +386,13 @@ describe "Traject::SolrJsonWriter" do
365
386
  end
366
387
  end
367
388
  end
389
+
390
+ describe "#delete_all!" do
391
+ it "deletes all" do
392
+ @writer.delete_all!
393
+ post_args = @fake_http_client.post_args.first
394
+ assert_equal "http://example.com/solr/update/json", post_args[0]
395
+ assert_equal JSON.generate({"delete" => { "query" => "*:*"}}), post_args[1]
396
+ end
397
+ end
368
398
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: traject
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.1.0
4
+ version: 3.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Rochkind
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2019-04-22 00:00:00.000000000 Z
12
+ date: 2019-09-10 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: concurrent-ruby