traject 3.1.0 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7bc0afc820efb8a2479d96913ae67552556978d264e20a55d0a4d4cef5ff9a2a
4
- data.tar.gz: 773529d885d46d2ba0afd323d680724678225bd3616a720dba92809479395dce
3
+ metadata.gz: 9a69bdb470c759d08d117f910ec1c6b16cbe93dbb3e955b653a07e3c03efec68
4
+ data.tar.gz: 68016ddf17fe29348248e5f26ff5a813c1485c37a0ac014faa69803be182f7b4
5
5
  SHA512:
6
- metadata.gz: 395f1a3cf62cfd0cccbdcd428830feb098586c9ca1edc7adecff31483545a62dbc06631e7c166ea0ce1b229afeddcec7d994245d78871f1a6644c591d6cc432c
7
- data.tar.gz: 647f9cfbf0a7a0876cef29b5a90a9fd6d3dd1f6a1010b332d58e132875b5c4a7f8be9bdab1afabb5ec0bc583af9ea0d8a422cc87eec6c04d93c9fe9f4b4d1e9a
6
+ metadata.gz: 8921a5d2349025291f28c7d39ebec44c0c0ebdc11a489e1d4c22913c04c840cf40163c576b14432794f1d663b9b81eb1d777b87d38af2b429f4b6ac98982ad48
7
+ data.tar.gz: 8bdb95f98baa11ee60c74a7a4bbab06c44fb5da466138d098b05e5ab3fddc28a1fe8926044be5350e5e0b8a91ccd2d6e2e2ecf78424cfc0430464dd0ccf79c66
data/CHANGES.md CHANGED
@@ -1,5 +1,24 @@
1
1
  # Changes
2
2
 
3
+ ## Next
4
+
5
+ *
6
+
7
+ *
8
+
9
+ *
10
+
11
+ ## 3.2.0
12
+
13
+ * NokogiriReader has a "nokogiri.strict_mode" setting. Set to true or string 'true' to ask Nokogiri to parse in strict mode, so it will immediately raise on ill-formed XML, instead of Nokogiri's default of recovering as best it can. https://github.com/traject/traject/pull/226
14
+
15
+ * SolrJsonWriter
16
+
17
+ * Utility method `delete_all!` sends a delete all query to the Solr URL endpoint. https://github.com/traject/traject/pull/227
18
+
19
+ * Allow basic auth configuration of the default http client via `solr_writer.basic_auth_user` and `solr_writer.basic_auth_password`. https://github.com/traject/traject/pull/231
20
+
21
+
3
22
  ## 3.1.0
4
23
 
5
24
  ### Added
data/doc/settings.md CHANGED
@@ -93,6 +93,8 @@ settings are applied first of all. It's recommended you use `provide`.
93
93
 
94
94
  * `solr_writer.thread_pool`: defaults to 1 (single bg thread). A thread pool is used for submitting docs to solr. Set to 0 or nil to disable threading. Set to 1, there will still be a single bg thread doing the adds. May make sense to set higher than number of cores on your indexing machine, as these threads will mostly be waiting on Solr. Speed/capacity of your solr might be more relevant. Note that processing_thread_pool threads can end up submitting to solr too, if the solr_writer.thread_pool is full.
95
95
 
96
+ * `solr_writer.basic_auth_user`, `solr_writer.basic_auth_password`: Not set by default. When both are set, the writer's default HTTP client is configured to use HTTP basic auth with these credentials.
97
+
96
98
 
97
99
  ### Dealing with MARC data
98
100
 
@@ -21,6 +21,9 @@ module Traject
21
21
  # If you need to use namespaces here, you need to have them registered with
22
22
  # `nokogiri.default_namespaces`. If your source docs use namespaces, you DO need
23
23
  # to use them in your each_record_xpath.
24
+ # * nokogiri.strict_mode: if set to `true` or `"true"`, ask Nokogiri to parse in 'strict'
25
+ # mode, it will raise a `Nokogiri::XML::SyntaxError` if the XML is not well-formed, instead
26
+ # of trying to apply its best-guess correction. https://nokogiri.org/tutorials/ensuring_well_formed_markup.html
24
27
  # * nokogiri_reader.extra_xpath_hooks: Experimental in progress, see below.
25
28
  #
26
29
  # ## nokogiri_reader.extra_xpath_hooks: For handling nodes outside of your each_record_xpath
@@ -87,7 +90,11 @@ module Traject
87
90
  end
88
91
 
89
92
  def each
90
- whole_input_doc = Nokogiri::XML.parse(input_stream)
93
+ config_proc = if settings["nokogiri.strict_mode"]
94
+ proc { |config| config.strict }
95
+ end
96
+
97
+ whole_input_doc = Nokogiri::XML.parse(input_stream, &config_proc)
91
98
 
92
99
  if each_record_xpath
93
100
  whole_input_doc.xpath(each_record_xpath, default_namespaces).each do |matching_node|
@@ -114,6 +114,12 @@ class Traject::SolrJsonWriter
114
114
  if @settings["solr_writer.http_timeout"]
115
115
  client.connect_timeout = client.receive_timeout = client.send_timeout = @settings["solr_writer.http_timeout"]
116
116
  end
117
+
118
+ if @settings["solr_writer.basic_auth_user"] &&
119
+ @settings["solr_writer.basic_auth_password"]
120
+ client.set_auth(@settings["solr.url"], @settings["solr_writer.basic_auth_user"], @settings["solr_writer.basic_auth_password"])
121
+ end
122
+
117
123
  client
118
124
  end
119
125
 
@@ -270,6 +276,13 @@ class Traject::SolrJsonWriter
270
276
  end
271
277
  end
272
278
 
279
+ # Send a delete all query.
280
+ #
281
+ # This method takes no params and will not automatically commit the deletes.
282
+ # @example @writer.delete_all!
283
+ def delete_all!
284
+ delete(query: "*:*")
285
+ end
273
286
 
274
287
  # Get the logger from the settings, or default to an effectively null logger
275
288
  def logger
@@ -1,3 +1,3 @@
1
1
  module Traject
2
- VERSION = "3.1.0"
2
+ VERSION = "3.2.0"
3
3
  end
@@ -134,6 +134,16 @@ describe "Traject::NokogiriReader" do
134
134
  end
135
135
  end
136
136
 
137
+ describe "strict_mode" do
138
+ it "raises on non-well-formed" do
139
+ # invalid because there are two sibling root nodes; XML requires exactly one root node
140
+ reader = Traject::NokogiriReader.new(StringIO.new("<doc></doc><doc></doc>"), {"nokogiri.strict_mode" => "true" })
141
+ assert_raises(Nokogiri::XML::SyntaxError) {
142
+ reader.each { |r| }
143
+ }
144
+ end
145
+ end
146
+
137
147
 
138
148
  def shared_tests
139
149
  @reader = Traject::NokogiriReader.new(File.open(@xml_sample_path), {
@@ -170,6 +170,27 @@ describe "Traject::SolrJsonWriter" do
170
170
  assert_length 1, @fake_http_client.post_args, "Has flushed to solr"
171
171
  end
172
172
 
173
+ it "defaults to not setting basic authentication" do
174
+ settings = { "solr.url" => "http://example.com/solr/foo" }
175
+ writer = Traject::SolrJsonWriter.new(settings)
176
+ auth = writer.instance_variable_get("@http_client")
177
+ .www_auth.basic_auth.instance_variable_get("@auth")
178
+ assert(auth.empty?)
179
+ end
180
+
181
+ it "allows basic authentication setup" do
182
+ settings = {
183
+ "solr.url" => "http://example.com/solr/foo",
184
+ "solr_writer.basic_auth_user" => "foo",
185
+ "solr_writer.basic_auth_password" => "bar",
186
+ }
187
+
188
+ writer = Traject::SolrJsonWriter.new(settings)
189
+ auth = writer.instance_variable_get("@http_client")
190
+ .www_auth.basic_auth.instance_variable_get("@auth")
191
+ assert(!auth.empty?)
192
+ end
193
+
173
194
  describe "commit" do
174
195
  it "commits on close when set" do
175
196
  @writer = create_writer("solr.url" => "http://example.com", "solr_writer.commit_on_close" => "true")
@@ -365,4 +386,13 @@ describe "Traject::SolrJsonWriter" do
365
386
  end
366
387
  end
367
388
  end
389
+
390
+ describe "#delete_all!" do
391
+ it "deletes all" do
392
+ @writer.delete_all!
393
+ post_args = @fake_http_client.post_args.first
394
+ assert_equal "http://example.com/solr/update/json", post_args[0]
395
+ assert_equal JSON.generate({"delete" => { "query" => "*:*"}}), post_args[1]
396
+ end
397
+ end
368
398
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: traject
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.1.0
4
+ version: 3.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Rochkind
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2019-04-22 00:00:00.000000000 Z
12
+ date: 2019-09-10 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: concurrent-ruby