traject 3.1.0 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGES.md +19 -0
- data/doc/settings.md +2 -0
- data/lib/traject/nokogiri_reader.rb +8 -1
- data/lib/traject/solr_json_writer.rb +13 -0
- data/lib/traject/version.rb +1 -1
- data/test/nokogiri_reader_test.rb +10 -0
- data/test/solr_json_writer_test.rb +30 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9a69bdb470c759d08d117f910ec1c6b16cbe93dbb3e955b653a07e3c03efec68
|
4
|
+
data.tar.gz: 68016ddf17fe29348248e5f26ff5a813c1485c37a0ac014faa69803be182f7b4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8921a5d2349025291f28c7d39ebec44c0c0ebdc11a489e1d4c22913c04c840cf40163c576b14432794f1d663b9b81eb1d777b87d38af2b429f4b6ac98982ad48
|
7
|
+
data.tar.gz: 8bdb95f98baa11ee60c74a7a4bbab06c44fb5da466138d098b05e5ab3fddc28a1fe8926044be5350e5e0b8a91ccd2d6e2e2ecf78424cfc0430464dd0ccf79c66
|
data/CHANGES.md
CHANGED
@@ -1,5 +1,24 @@
|
|
1
1
|
# Changes
|
2
2
|
|
3
|
+
## Next
|
4
|
+
|
5
|
+
*
|
6
|
+
|
7
|
+
*
|
8
|
+
|
9
|
+
*
|
10
|
+
|
11
|
+
## 3.2.0
|
12
|
+
|
13
|
+
* NokogiriReader has a "nokogiri.strict_mode" setting. Set to true or string 'true' to ask Nokogiri to parse in strict mode, so it will immediately raise on ill-formed XML, instead of Nokogiri's default behavior of doing what it can with it. https://github.com/traject/traject/pull/226
|
14
|
+
|
15
|
+
* SolrJsonWriter
|
16
|
+
|
17
|
+
* Utility method `delete_all!` sends a delete all query to the Solr URL endpoint. https://github.com/traject/traject/pull/227
|
18
|
+
|
19
|
+
* Allow basic auth configuration of the default http client via `solr_writer.basic_auth_user` and `solr_writer.basic_auth_password`. https://github.com/traject/traject/pull/231
|
20
|
+
|
21
|
+
|
3
22
|
## 3.1.0
|
4
23
|
|
5
24
|
### Added
|
data/doc/settings.md
CHANGED
@@ -93,6 +93,8 @@ settings are applied first of all. It's recommended you use `provide`.
|
|
93
93
|
|
94
94
|
* `solr_writer.thread_pool`: defaults to 1 (single bg thread). A thread pool is used for submitting docs to solr. Set to 0 or nil to disable threading. Set to 1, there will still be a single bg thread doing the adds. May make sense to set higher than number of cores on your indexing machine, as these threads will mostly be waiting on Solr. Speed/capacity of your solr might be more relevant. Note that processing_thread_pool threads can end up submitting to solr too, if solr_json_writer.thread_pool is full.
|
95
95
|
|
96
|
+
* `solr_writer.basic_auth_user`, `solr_writer.basic_auth_password`: Not set by default but when both are set the default writer is configured with basic auth.
|
97
|
+
|
96
98
|
|
97
99
|
### Dealing with MARC data
|
98
100
|
|
@@ -21,6 +21,9 @@ module Traject
|
|
21
21
|
# If you need to use namespaces here, you need to have them registered with
|
22
22
|
# `nokogiri.default_namespaces`. If your source docs use namespaces, you DO need
|
23
23
|
# to use them in your each_record_xpath.
|
24
|
+
# * nokogiri.strict_mode: if set to `true` or `"true"`, ask Nokogiri to parse in 'strict'
|
25
|
+
# mode, it will raise a `Nokogiri::XML::SyntaxError` if the XML is not well-formed, instead
|
26
|
+
# of trying to take its best-guess correction. https://nokogiri.org/tutorials/ensuring_well_formed_markup.html
|
24
27
|
# * nokogiri_reader.extra_xpath_hooks: Experimental in progress, see below.
|
25
28
|
#
|
26
29
|
# ## nokogiri_reader.extra_xpath_hooks: For handling nodes outside of your each_record_xpath
|
@@ -87,7 +90,11 @@ module Traject
|
|
87
90
|
end
|
88
91
|
|
89
92
|
def each
|
90
|
-
|
93
|
+
config_proc = if settings["nokogiri.strict_mode"]
|
94
|
+
proc { |config| config.strict }
|
95
|
+
end
|
96
|
+
|
97
|
+
whole_input_doc = Nokogiri::XML.parse(input_stream, &config_proc)
|
91
98
|
|
92
99
|
if each_record_xpath
|
93
100
|
whole_input_doc.xpath(each_record_xpath, default_namespaces).each do |matching_node|
|
@@ -114,6 +114,12 @@ class Traject::SolrJsonWriter
|
|
114
114
|
if @settings["solr_writer.http_timeout"]
|
115
115
|
client.connect_timeout = client.receive_timeout = client.send_timeout = @settings["solr_writer.http_timeout"]
|
116
116
|
end
|
117
|
+
|
118
|
+
if @settings["solr_writer.basic_auth_user"] &&
|
119
|
+
@settings["solr_writer.basic_auth_password"]
|
120
|
+
client.set_auth(@settings["solr.url"], @settings["solr_writer.basic_auth_user"], @settings["solr_writer.basic_auth_password"])
|
121
|
+
end
|
122
|
+
|
117
123
|
client
|
118
124
|
end
|
119
125
|
|
@@ -270,6 +276,13 @@ class Traject::SolrJsonWriter
|
|
270
276
|
end
|
271
277
|
end
|
272
278
|
|
279
|
+
# Send a delete all query.
|
280
|
+
#
|
281
|
+
# This method takes no params and will not automatically commit the deletes.
|
282
|
+
# @example @writer.delete_all!
|
283
|
+
def delete_all!
|
284
|
+
delete(query: "*:*")
|
285
|
+
end
|
273
286
|
|
274
287
|
# Get the logger from the settings, or default to an effectively null logger
|
275
288
|
def logger
|
data/lib/traject/version.rb
CHANGED
@@ -134,6 +134,16 @@ describe "Traject::NokogiriReader" do
|
|
134
134
|
end
|
135
135
|
end
|
136
136
|
|
137
|
+
describe "strict_mode" do
|
138
|
+
it "raises on non-well-formed" do
|
139
|
+
# invalid because there are two sibling root nodes, XML requires one root node
|
140
|
+
reader = Traject::NokogiriReader.new(StringIO.new("<doc></doc><doc></doc>"), {"nokogiri.strict_mode" => "true" })
|
141
|
+
assert_raises(Nokogiri::XML::SyntaxError) {
|
142
|
+
reader.each { |r| }
|
143
|
+
}
|
144
|
+
end
|
145
|
+
end
|
146
|
+
|
137
147
|
|
138
148
|
def shared_tests
|
139
149
|
@reader = Traject::NokogiriReader.new(File.open(@xml_sample_path), {
|
@@ -170,6 +170,27 @@ describe "Traject::SolrJsonWriter" do
|
|
170
170
|
assert_length 1, @fake_http_client.post_args, "Has flushed to solr"
|
171
171
|
end
|
172
172
|
|
173
|
+
it "defaults to not setting basic authentication" do
|
174
|
+
settings = { "solr.url" => "http://example.com/solr/foo" }
|
175
|
+
writer = Traject::SolrJsonWriter.new(settings)
|
176
|
+
auth = writer.instance_variable_get("@http_client")
|
177
|
+
.www_auth.basic_auth.instance_variable_get("@auth")
|
178
|
+
assert(auth.empty?)
|
179
|
+
end
|
180
|
+
|
181
|
+
it "allows basic authentication setup" do
|
182
|
+
settings = {
|
183
|
+
"solr.url" => "http://example.com/solr/foo",
|
184
|
+
"solr_writer.basic_auth_user" => "foo",
|
185
|
+
"solr_writer.basic_auth_password" => "bar",
|
186
|
+
}
|
187
|
+
|
188
|
+
writer = Traject::SolrJsonWriter.new(settings)
|
189
|
+
auth = writer.instance_variable_get("@http_client")
|
190
|
+
.www_auth.basic_auth.instance_variable_get("@auth")
|
191
|
+
assert(!auth.empty?)
|
192
|
+
end
|
193
|
+
|
173
194
|
describe "commit" do
|
174
195
|
it "commits on close when set" do
|
175
196
|
@writer = create_writer("solr.url" => "http://example.com", "solr_writer.commit_on_close" => "true")
|
@@ -365,4 +386,13 @@ describe "Traject::SolrJsonWriter" do
|
|
365
386
|
end
|
366
387
|
end
|
367
388
|
end
|
389
|
+
|
390
|
+
describe "#delete_all!" do
|
391
|
+
it "deletes all" do
|
392
|
+
@writer.delete_all!
|
393
|
+
post_args = @fake_http_client.post_args.first
|
394
|
+
assert_equal "http://example.com/solr/update/json", post_args[0]
|
395
|
+
assert_equal JSON.generate({"delete" => { "query" => "*:*"}}), post_args[1]
|
396
|
+
end
|
397
|
+
end
|
368
398
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: traject
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.1.0
|
4
|
+
version: 3.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Rochkind
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2019-
|
12
|
+
date: 2019-09-10 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: concurrent-ruby
|