traject 3.1.0 → 3.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGES.md +19 -0
- data/doc/settings.md +2 -0
- data/lib/traject/nokogiri_reader.rb +8 -1
- data/lib/traject/solr_json_writer.rb +13 -0
- data/lib/traject/version.rb +1 -1
- data/test/nokogiri_reader_test.rb +10 -0
- data/test/solr_json_writer_test.rb +30 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9a69bdb470c759d08d117f910ec1c6b16cbe93dbb3e955b653a07e3c03efec68
|
4
|
+
data.tar.gz: 68016ddf17fe29348248e5f26ff5a813c1485c37a0ac014faa69803be182f7b4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8921a5d2349025291f28c7d39ebec44c0c0ebdc11a489e1d4c22913c04c840cf40163c576b14432794f1d663b9b81eb1d777b87d38af2b429f4b6ac98982ad48
|
7
|
+
data.tar.gz: 8bdb95f98baa11ee60c74a7a4bbab06c44fb5da466138d098b05e5ab3fddc28a1fe8926044be5350e5e0b8a91ccd2d6e2e2ecf78424cfc0430464dd0ccf79c66
|
data/CHANGES.md
CHANGED
@@ -1,5 +1,24 @@
|
|
1
1
|
# Changes
|
2
2
|
|
3
|
+
## Next
|
4
|
+
|
5
|
+
*
|
6
|
+
|
7
|
+
*
|
8
|
+
|
9
|
+
*
|
10
|
+
|
11
|
+
## 3.2.0
|
12
|
+
|
13
|
+
* NokogiriReader has a "nokogiri.strict_mode" setting. Set to true or string 'true' to ask Nokogiri to parse in strict mode, so it will immediately raise on ill-formed XML, instead of Nokogiri's default behavior of doing what it can with it. https://github.com/traject/traject/pull/226
|
14
|
+
|
15
|
+
* SolrJsonWriter
|
16
|
+
|
17
|
+
* Utility method `delete_all!` sends a delete all query to the Solr URL endpoint. https://github.com/traject/traject/pull/227
|
18
|
+
|
19
|
+
* Allow basic auth configuration of the default http client via `solr_writer.basic_auth_user` and `solr_writer.basic_auth_password`. https://github.com/traject/traject/pull/231
|
20
|
+
|
21
|
+
|
3
22
|
## 3.1.0
|
4
23
|
|
5
24
|
### Added
|
data/doc/settings.md
CHANGED
@@ -93,6 +93,8 @@ settings are applied first of all. It's recommended you use `provide`.
|
|
93
93
|
|
94
94
|
* `solr_writer.thread_pool`: defaults to 1 (single bg thread). A thread pool is used for submitting docs to solr. Set to 0 or nil to disable threading. Set to 1, there will still be a single bg thread doing the adds. May make sense to set higher than number of cores on your indexing machine, as these threads will mostly be waiting on Solr. Speed/capacity of your solr might be more relevant. Note that processing_thread_pool threads can end up submitting to solr too, if solr_writer.thread_pool is full.
|
95
95
|
|
96
|
+
* `solr_writer.basic_auth_user`, `solr_writer.basic_auth_password`: Not set by default, but when both are set, the writer's default HTTP client is configured with basic auth.
|
97
|
+
|
96
98
|
|
97
99
|
### Dealing with MARC data
|
98
100
|
|
@@ -21,6 +21,9 @@ module Traject
|
|
21
21
|
# If you need to use namespaces here, you need to have them registered with
|
22
22
|
# `nokogiri.default_namespaces`. If your source docs use namespaces, you DO need
|
23
23
|
# to use them in your each_record_xpath.
|
24
|
+
# * nokogiri.strict_mode: if set to `true` or `"true"`, ask Nokogiri to parse in 'strict'
|
25
|
+
# mode, it will raise a `Nokogiri::XML::SyntaxError` if the XML is not well-formed, instead
|
26
|
+
# of trying to take its best-guess correction. https://nokogiri.org/tutorials/ensuring_well_formed_markup.html
|
24
27
|
# * nokogiri_reader.extra_xpath_hooks: Experimental in progress, see below.
|
25
28
|
#
|
26
29
|
# ## nokogiri_reader.extra_xpath_hooks: For handling nodes outside of your each_record_xpath
|
@@ -87,7 +90,11 @@ module Traject
|
|
87
90
|
end
|
88
91
|
|
89
92
|
def each
|
90
|
-
|
93
|
+
config_proc = if settings["nokogiri.strict_mode"]
|
94
|
+
proc { |config| config.strict }
|
95
|
+
end
|
96
|
+
|
97
|
+
whole_input_doc = Nokogiri::XML.parse(input_stream, &config_proc)
|
91
98
|
|
92
99
|
if each_record_xpath
|
93
100
|
whole_input_doc.xpath(each_record_xpath, default_namespaces).each do |matching_node|
|
@@ -114,6 +114,12 @@ class Traject::SolrJsonWriter
|
|
114
114
|
if @settings["solr_writer.http_timeout"]
|
115
115
|
client.connect_timeout = client.receive_timeout = client.send_timeout = @settings["solr_writer.http_timeout"]
|
116
116
|
end
|
117
|
+
|
118
|
+
if @settings["solr_writer.basic_auth_user"] &&
|
119
|
+
@settings["solr_writer.basic_auth_password"]
|
120
|
+
client.set_auth(@settings["solr.url"], @settings["solr_writer.basic_auth_user"], @settings["solr_writer.basic_auth_password"])
|
121
|
+
end
|
122
|
+
|
117
123
|
client
|
118
124
|
end
|
119
125
|
|
@@ -270,6 +276,13 @@ class Traject::SolrJsonWriter
|
|
270
276
|
end
|
271
277
|
end
|
272
278
|
|
279
|
+
# Send a delete all query.
|
280
|
+
#
|
281
|
+
# This method takes no params and will not automatically commit the deletes.
|
282
|
+
# @example @writer.delete_all!
|
283
|
+
def delete_all!
|
284
|
+
delete(query: "*:*")
|
285
|
+
end
|
273
286
|
|
274
287
|
# Get the logger from the settings, or default to an effectively null logger
|
275
288
|
def logger
|
data/lib/traject/version.rb
CHANGED
@@ -134,6 +134,16 @@ describe "Traject::NokogiriReader" do
|
|
134
134
|
end
|
135
135
|
end
|
136
136
|
|
137
|
+
describe "strict_mode" do
|
138
|
+
it "raises on non-well-formed" do
|
139
|
+
# invalid because two sibling root nodes, XML requires one root node
|
140
|
+
reader = Traject::NokogiriReader.new(StringIO.new("<doc></doc><doc></doc>"), {"nokogiri.strict_mode" => "true" })
|
141
|
+
assert_raises(Nokogiri::XML::SyntaxError) {
|
142
|
+
reader.each { |r| }
|
143
|
+
}
|
144
|
+
end
|
145
|
+
end
|
146
|
+
|
137
147
|
|
138
148
|
def shared_tests
|
139
149
|
@reader = Traject::NokogiriReader.new(File.open(@xml_sample_path), {
|
@@ -170,6 +170,27 @@ describe "Traject::SolrJsonWriter" do
|
|
170
170
|
assert_length 1, @fake_http_client.post_args, "Has flushed to solr"
|
171
171
|
end
|
172
172
|
|
173
|
+
it "defaults to not setting basic authentication" do
|
174
|
+
settings = { "solr.url" => "http://example.com/solr/foo" }
|
175
|
+
writer = Traject::SolrJsonWriter.new(settings)
|
176
|
+
auth = writer.instance_variable_get("@http_client")
|
177
|
+
.www_auth.basic_auth.instance_variable_get("@auth")
|
178
|
+
assert(auth.empty?)
|
179
|
+
end
|
180
|
+
|
181
|
+
it "allows basic authentication setup" do
|
182
|
+
settings = {
|
183
|
+
"solr.url" => "http://example.com/solr/foo",
|
184
|
+
"solr_writer.basic_auth_user" => "foo",
|
185
|
+
"solr_writer.basic_auth_password" => "bar",
|
186
|
+
}
|
187
|
+
|
188
|
+
writer = Traject::SolrJsonWriter.new(settings)
|
189
|
+
auth = writer.instance_variable_get("@http_client")
|
190
|
+
.www_auth.basic_auth.instance_variable_get("@auth")
|
191
|
+
assert(!auth.empty?)
|
192
|
+
end
|
193
|
+
|
173
194
|
describe "commit" do
|
174
195
|
it "commits on close when set" do
|
175
196
|
@writer = create_writer("solr.url" => "http://example.com", "solr_writer.commit_on_close" => "true")
|
@@ -365,4 +386,13 @@ describe "Traject::SolrJsonWriter" do
|
|
365
386
|
end
|
366
387
|
end
|
367
388
|
end
|
389
|
+
|
390
|
+
describe "#delete_all!" do
|
391
|
+
it "deletes all" do
|
392
|
+
@writer.delete_all!
|
393
|
+
post_args = @fake_http_client.post_args.first
|
394
|
+
assert_equal "http://example.com/solr/update/json", post_args[0]
|
395
|
+
assert_equal JSON.generate({"delete" => { "query" => "*:*"}}), post_args[1]
|
396
|
+
end
|
397
|
+
end
|
368
398
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: traject
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.1.0
|
4
|
+
version: 3.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Rochkind
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2019-
|
12
|
+
date: 2019-09-10 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: concurrent-ruby
|