traject 2.3.4 → 3.0.0.alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.travis.yml +16 -9
- data/CHANGES.md +74 -1
- data/Gemfile +2 -1
- data/README.md +104 -53
- data/Rakefile +8 -1
- data/doc/indexing_rules.md +79 -63
- data/doc/programmatic_use.md +218 -0
- data/doc/settings.md +28 -1
- data/doc/xml.md +134 -0
- data/lib/traject.rb +5 -0
- data/lib/traject/array_writer.rb +34 -0
- data/lib/traject/command_line.rb +18 -22
- data/lib/traject/debug_writer.rb +2 -5
- data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
- data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
- data/lib/traject/indexer.rb +321 -92
- data/lib/traject/indexer/context.rb +39 -13
- data/lib/traject/indexer/marc_indexer.rb +30 -0
- data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
- data/lib/traject/indexer/settings.rb +36 -53
- data/lib/traject/indexer/step.rb +27 -33
- data/lib/traject/macros/marc21.rb +37 -12
- data/lib/traject/macros/nokogiri_macros.rb +43 -0
- data/lib/traject/macros/transformation.rb +162 -0
- data/lib/traject/marc_extractor.rb +2 -0
- data/lib/traject/ndj_reader.rb +1 -1
- data/lib/traject/nokogiri_reader.rb +179 -0
- data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
- data/lib/traject/solr_json_writer.rb +19 -12
- data/lib/traject/thread_pool.rb +13 -0
- data/lib/traject/util.rb +14 -2
- data/lib/traject/version.rb +1 -1
- data/test/debug_writer_test.rb +3 -3
- data/test/delimited_writer_test.rb +3 -3
- data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
- data/test/indexer/context_test.rb +23 -13
- data/test/indexer/error_handler_test.rb +59 -0
- data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
- data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
- data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
- data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
- data/test/indexer/macros/to_field_test.rb +2 -2
- data/test/indexer/macros/transformation_test.rb +177 -0
- data/test/indexer/map_record_test.rb +2 -3
- data/test/indexer/nokogiri_indexer_test.rb +103 -0
- data/test/indexer/process_record_test.rb +55 -0
- data/test/indexer/process_with_test.rb +148 -0
- data/test/indexer/read_write_test.rb +52 -2
- data/test/indexer/settings_test.rb +34 -24
- data/test/indexer/to_field_test.rb +27 -2
- data/test/marc_extractor_test.rb +7 -7
- data/test/marc_reader_test.rb +4 -4
- data/test/nokogiri_reader_test.rb +158 -0
- data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
- data/test/solr_json_writer_test.rb +24 -28
- data/test/test_helper.rb +8 -2
- data/test/test_support/namespace-test.xml +7 -0
- data/test/test_support/nokogiri_demo_config.rb +17 -0
- data/test/test_support/oai-pmh-one-record-2.xml +24 -0
- data/test/test_support/oai-pmh-one-record-first.xml +24 -0
- data/test/test_support/sample-oai-no-namespace.xml +197 -0
- data/test/test_support/sample-oai-pmh.xml +197 -0
- data/test/thread_pool_test.rb +38 -0
- data/test/translation_map_test.rb +3 -3
- data/test/translation_maps/ruby_map.rb +2 -1
- data/test/translation_maps/yaml_map.yaml +2 -1
- data/traject.gemspec +4 -11
- metadata +92 -6
@@ -0,0 +1,43 @@
|
|
1
|
+
module Traject
|
2
|
+
module Macros
|
3
|
+
module NokogiriMacros
|
4
|
+
|
5
|
+
def default_namespaces
|
6
|
+
@default_namespaces ||= (settings["nokogiri.namespaces"] || {}).tap { |ns|
|
7
|
+
unless ns.kind_of?(Hash)
|
8
|
+
raise ArgumentError, "nokogiri.namespaces must be a hash, not: #{ns.inspect}"
|
9
|
+
end
|
10
|
+
}
|
11
|
+
end
|
12
|
+
|
13
|
+
def extract_xpath(xpath, ns: {}, to_text: true)
|
14
|
+
if ns && ns.length > 0
|
15
|
+
namespaces = default_namespaces.merge(ns)
|
16
|
+
else
|
17
|
+
namespaces = default_namespaces
|
18
|
+
end
|
19
|
+
|
20
|
+
lambda do |record, accumulator|
|
21
|
+
result = record.xpath(xpath, namespaces)
|
22
|
+
|
23
|
+
if to_text
|
24
|
+
# take all matches, for each match take all
|
25
|
+
# text content, join it together separated with spaces
|
26
|
+
# Make sure to avoid text content that was all blank, which is "between the children"
|
27
|
+
# whitespace.
|
28
|
+
result = result.collect do |n|
|
29
|
+
n.xpath('.//text()').collect(&:text).tap do |arr|
|
30
|
+
arr.reject! { |s| s =~ (/\A\s+\z/) }
|
31
|
+
end.join(" ")
|
32
|
+
end
|
33
|
+
else
|
34
|
+
# just put all matches in accumulator as Nokogiri::XML::Node's
|
35
|
+
result = result.to_a
|
36
|
+
end
|
37
|
+
|
38
|
+
accumulator.concat result
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
@@ -0,0 +1,162 @@
|
|
1
|
+
module Traject
|
2
|
+
module Macros
|
3
|
+
# Macros intended to be mixed into an Indexer and used in config
|
4
|
+
# as second or further args to #to_field, to transform existing accumulator values.
|
5
|
+
#
|
6
|
+
# They have the same form as any proc/block passed to #to_field, but
|
7
|
+
# operate on an existing accumulator, intended to be used as non-first-step
|
8
|
+
# transformations.
|
9
|
+
#
|
10
|
+
# Some of these are extracted from extract_marc options, so they can be
|
11
|
+
# used with any first-step extract methods. Some informed by current users.
|
12
|
+
module Transformation
|
13
|
+
|
14
|
+
# Maps all values on accumulator through a Traject::TranslationMap.
|
15
|
+
#
|
16
|
+
# A Traject::TranslationMap is hash-like mapping from input to output, usually
|
17
|
+
# defined in a yaml or dot-properties file, which can be looked up in load path
|
18
|
+
# with a file name as arg. See [Traject::TranslationMap](../translation_map.rb)
|
19
|
+
# header comments for details.
|
20
|
+
#
|
21
|
+
# Using this macro, you can pass in one TranslationMap initializer arg, but you can
|
22
|
+
# also pass in multiple, and they will be merged into each other (later ones override earlier ones), so
|
23
|
+
# you can use this to apply over-rides: Either from another on-disk map, or even from
|
24
|
+
# an inline hash (since a Hash is a valid TranslationMap initialization arg too).
|
25
|
+
#
|
26
|
+
# @example
|
27
|
+
# to_field "cataloging_agency", extract_marc("040a"), translation_map("marc_040a")
|
28
|
+
#
|
29
|
+
# @example with override
|
30
|
+
# to_field "cataloging_agency", extract_marc("040a"), translation_map("marc_040a", "local_marc_040a")
|
31
|
+
#
|
32
|
+
# @example with multiple overrides, including local hash
|
33
|
+
# to_field "cataloging_agency", extract_marc("040a"), translation_map("marc_040a", "local_marc_040a", {"DLC" => "U.S. LoC"})
|
34
|
+
def translation_map(*translation_map_specifier)
|
35
|
+
translation_map = translation_map_specifier.
|
36
|
+
collect { |spec| Traject::TranslationMap.new(spec) }.
|
37
|
+
reduce(:merge)
|
38
|
+
|
39
|
+
lambda do |rec, acc|
|
40
|
+
translation_map.translate_array! acc
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
# Pass in a proc/lambda arg or a block (or both), that will be called on each
|
45
|
+
# value already in the accumulator, to transform it. (Ie, with `#map!`/`#collect!` on your proc(s)).
|
46
|
+
#
|
47
|
+
# Due to how ruby syntax precedence works, the block form is probably not too useful
|
48
|
+
# in traject config files, except with the `&:` trick.
|
49
|
+
#
|
50
|
+
# The "stabby lambda" may be convenient for passing an explicit proc argument.
|
51
|
+
#
|
52
|
+
# You can pass both an explicit proc arg and a block, in which case the proc arg
|
53
|
+
# will be applied first.
|
54
|
+
#
|
55
|
+
# @example
|
56
|
+
# to_field "something", extract_marc("something"), transform(&:upcase)
|
57
|
+
#
|
58
|
+
# @example
|
59
|
+
# to_field "something", extract_marc("something"), transform(->(val) { val.tr('^a-z', "\uFFFD") })
|
60
|
+
def transform(a_proc=nil, &block)
|
61
|
+
unless a_proc || block
|
62
|
+
raise ArgumentError, "Needs a transform proc arg or block arg"
|
63
|
+
end
|
64
|
+
|
65
|
+
transformer_callable = if a_proc && block
|
66
|
+
# need to make a combo wrapper.
|
67
|
+
->(val) { block.call(a_proc.call(val)) }
|
68
|
+
elsif a_proc
|
69
|
+
a_proc
|
70
|
+
else
|
71
|
+
block
|
72
|
+
end
|
73
|
+
|
74
|
+
lambda do |rec, acc|
|
75
|
+
acc.collect! do |value|
|
76
|
+
transformer_callable.call(value)
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
# Adds a literal to accumulator if accumulator was empty
|
82
|
+
#
|
83
|
+
# @example
|
84
|
+
# to_field "title", extract_marc("245abc"), default("Unknown Title")
|
85
|
+
def default(default_value)
|
86
|
+
lambda do |rec, acc|
|
87
|
+
if acc.empty?
|
88
|
+
acc << default_value
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
# Removes all but the first value from accumulator, if more values were present.
|
94
|
+
#
|
95
|
+
# @example
|
96
|
+
# to_field "main_author", extract_marc("100"), first_only
|
97
|
+
def first_only
|
98
|
+
lambda do |rec, acc|
|
99
|
+
# kind of esoteric, but slice! called this way removes everything after the first element, mutating acc in place
|
100
|
+
acc.slice!(1, acc.length)
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
|
105
|
+
# calls ruby `uniq!` on accumulator, removes any duplicate values
|
106
|
+
#
|
107
|
+
# @example
|
108
|
+
# to_field "something", extract_marc("245:240"), unique
|
109
|
+
def unique
|
110
|
+
lambda do |rec, acc|
|
111
|
+
acc.uniq!
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
|
+
|
116
|
+
# For each value in accumulator, remove all leading or trailing whitespace
|
117
|
+
# (unicode aware). Like ruby #strip, but aware of unicode whitespace.
|
118
|
+
#
|
119
|
+
# @example
|
120
|
+
# to_field "title", extract_marc("245"), strip
|
121
|
+
def strip
|
122
|
+
lambda do |rec, acc|
|
123
|
+
acc.collect! do |v|
|
124
|
+
# unicode whitespace class aware
|
125
|
+
v.sub(/\A[[:space:]]+/,'').sub(/[[:space:]]+\Z/, '')
|
126
|
+
end
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
# Run ruby `split` on each value in the accumulator, with separator
|
131
|
+
# given, flatten all results into single array as accumulator.
|
132
|
+
# Will generally result in more individual values in accumulator as output than were
|
133
|
+
# there in input, as input values are split up into multiple values.
|
134
|
+
def split(separator)
|
135
|
+
lambda do |rec, acc|
|
136
|
+
acc.replace( acc.flat_map { |v| v.split(separator) } )
|
137
|
+
end
|
138
|
+
end
|
139
|
+
|
140
|
+
# Append argument to end of each value in accumulator.
|
141
|
+
def append(suffix)
|
142
|
+
lambda do |rec, acc|
|
143
|
+
acc.collect! { |v| v + suffix }
|
144
|
+
end
|
145
|
+
end
|
146
|
+
|
147
|
+
# prepend argument to beginning of each value in accumulator.
|
148
|
+
def prepend(prefix)
|
149
|
+
lambda do |rec, acc|
|
150
|
+
acc.collect! { |v| prefix + v }
|
151
|
+
end
|
152
|
+
end
|
153
|
+
|
154
|
+
# Run ruby `gsub` on each value in accumulator, with pattern and replace value given.
|
155
|
+
def gsub(pattern, replace)
|
156
|
+
lambda do |rec, acc|
|
157
|
+
acc.collect! { |v| v.gsub(pattern, replace) }
|
158
|
+
end
|
159
|
+
end
|
160
|
+
end
|
161
|
+
end
|
162
|
+
end
|
@@ -151,6 +151,8 @@ module Traject
|
|
151
151
|
if options[:alternate_script] != false
|
152
152
|
@fetch_alternate_script = true
|
153
153
|
show_interest_in_tag(ALTERNATE_SCRIPT_TAG)
|
154
|
+
else
|
155
|
+
@fetch_alternate_script = false
|
154
156
|
end
|
155
157
|
|
156
158
|
@interesting_tags_list = @interesting_tags_hash.keys
|
data/lib/traject/ndj_reader.rb
CHANGED
@@ -12,7 +12,7 @@ class Traject::NDJReader
|
|
12
12
|
def initialize(input_stream, settings)
|
13
13
|
@settings = settings
|
14
14
|
@input_stream = input_stream
|
15
|
-
if /\.gz\Z/.match(
|
15
|
+
if input_stream.respond_to?(:path) && /\.gz\Z/.match(input_stream.path)
|
16
16
|
@input_stream = Zlib::GzipReader.new(@input_stream, :external_encoding => "UTF-8")
|
17
17
|
end
|
18
18
|
end
|
@@ -0,0 +1,179 @@
|
|
1
|
+
module Traject
|
2
|
+
# A Traject reader which reads XML, and yields zero to many Nokogiri::XML::Document
|
3
|
+
# objects as source records in the traject pipeline.
|
4
|
+
#
|
5
|
+
# It does process the entire input document with Nokogiri::XML.parse, DOM-parsing,
|
6
|
+
# so will take RAM for the entire input document, until iteration completes.
|
7
|
+
# (There is a separate half-finished `ExperimentalStreamingNokogiriReader` available, but it is
|
8
|
+
# experimental, half-finished, may disappear or change in backwards compat at any time, problematic,
|
9
|
+
# not recommended for production use, etc.)
|
10
|
+
#
|
11
|
+
# You can have it yield the _entire_ input XML as a single traject source record
|
12
|
+
# (default), or you can use setting `nokogiri.each_record_xpath` to split
|
13
|
+
# the source up into separate records to yield into traject pipeline -- each one
|
14
|
+
# will be its own Nokogiri::XML::Document.
|
15
|
+
#
|
16
|
+
# ## Settings
|
17
|
+
# * nokogiri.namespaces: Set namespace prefixes that can be used in
|
18
|
+
# other settings, including `extract_xpath` from NokogiriMacros.
|
19
|
+
# * nokogiri.each_record_xpath: if set to a string xpath, will take all matching nodes
|
20
|
+
# from the input doc, and yield them individually as source records to the pipeline.
|
21
|
+
# If you need to use namespaces here, you need to have them registered with
|
22
|
+
# `nokogiri.namespaces`. If your source docs use namespaces, you DO need
|
23
|
+
# to use them in your each_record_xpath.
|
24
|
+
# * nokogiri_reader.extra_xpath_hooks: Experimental in progress, see below.
|
25
|
+
#
|
26
|
+
# ## nokogiri_reader.extra_xpath_hooks: For handling nodes outside of your each_record_xpath
|
27
|
+
#
|
28
|
+
# What if you want to use each_record_xpath to yield certain nodes as source documents, but
|
29
|
+
# there is some additional info in other parts of the input document you need? This came up
|
30
|
+
# when developing the OaiPmhNokogiriReader, which yields "//oai:record" as pipeline source documents,
|
31
|
+
# but also needed to look at "//oai:resumptionToken" to scrape the entire results.
|
32
|
+
#
|
33
|
+
# There is a semi-finished/in-progress feature that meets that use case -- unclear if it will meet
|
34
|
+
# all use cases for this general issue.
|
35
|
+
#
|
36
|
+
# Setting `nokogiri_reader.extra_xpath_hooks` can be set to a Hash where the keys are xpaths (if using
|
37
|
+
# namespaces, they must be registered with `nokogiri.namespaces`), and the value is a lambda/
|
38
|
+
# proc/callable object, taking two arguments.
|
39
|
+
#
|
40
|
+
# provide "nokogiri_reader.extra_xpath_hooks", {
|
41
|
+
# "//oai:resumptionToken" =>
|
42
|
+
# lambda do |node, clipboard|
|
43
|
+
# clipboard[:resumption_token] = node.text
|
44
|
+
# end
|
45
|
+
# }
|
46
|
+
#
|
47
|
+
# The first arg is the matching node. What's this clipboard? Well, what are you
|
48
|
+
# gonna _do_ with what you get out of there, that you can do in a thread-safe way
|
49
|
+
# in the middle of nokogiri processing? The second arg is a thread-safe Hash "clipboard"
|
50
|
+
# that you can store things in, and later access via reader.clipboard.
|
51
|
+
#
|
52
|
+
# There's no great thread-safe way to get reader.clipboard in a normal nokogiri pipeline though,
|
53
|
+
# (the reader can change in multi-file handling so there can be a race condition if you try naively,
|
54
|
+
# don't!) Which is why this feature needs some work for general applicability. The OaiPmhNokogiriReader
|
55
|
+
# manually creates its readers outside the usual nokogiri flow, so can use it.
|
56
|
+
class NokogiriReader
|
57
|
+
include Enumerable
|
58
|
+
|
59
|
+
attr_reader :settings, :input_stream, :clipboard, :path_tracker
|
60
|
+
|
61
|
+
def initialize(input_stream, settings)
|
62
|
+
@settings = Traject::Indexer::Settings.new settings
|
63
|
+
@input_stream = input_stream
|
64
|
+
@clipboard = Traject::Util.is_jruby? ? Concurrent::Map.new : Concurrent::Hash.new
|
65
|
+
|
66
|
+
default_namespaces # trigger validation
|
67
|
+
validate_xpath(each_record_xpath, key_name: "each_record_xpath") if each_record_xpath
|
68
|
+
extra_xpath_hooks.each_pair do |xpath, _callable|
|
69
|
+
validate_xpath(xpath, key_name: "extra_xpath_hooks")
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
def each_record_xpath
|
74
|
+
@each_record_xpath ||= settings["nokogiri.each_record_xpath"]
|
75
|
+
end
|
76
|
+
|
77
|
+
def extra_xpath_hooks
|
78
|
+
@extra_xpath_hooks ||= settings["nokogiri_reader.extra_xpath_hooks"] || {}
|
79
|
+
end
|
80
|
+
|
81
|
+
def default_namespaces
|
82
|
+
@default_namespaces ||= (settings["nokogiri.namespaces"] || {}).tap { |ns|
|
83
|
+
unless ns.kind_of?(Hash)
|
84
|
+
raise ArgumentError, "nokogiri.namespaces must be a hash, not: #{ns.inspect}"
|
85
|
+
end
|
86
|
+
}
|
87
|
+
end
|
88
|
+
|
89
|
+
def each
|
90
|
+
whole_input_doc = Nokogiri::XML.parse(input_stream)
|
91
|
+
|
92
|
+
if each_record_xpath
|
93
|
+
whole_input_doc.xpath(each_record_xpath, default_namespaces).each do |matching_node|
|
94
|
+
# We want to take the matching node, and make it into root in a new Nokogiri document.
|
95
|
+
# This is tricky to do as performant as possible (we want to re-use the existing libxml node),
|
96
|
+
# while preserving namespaces properly (especially in jruby). Some uses of noko api that seem
|
97
|
+
# like they should work don't, esp in jruby.
|
98
|
+
child_doc = Nokogiri::XML::Document.new
|
99
|
+
|
100
|
+
reparent_node_to_root(child_doc, matching_node)
|
101
|
+
|
102
|
+
yield child_doc
|
103
|
+
|
104
|
+
child_doc = nil # hopefully make things easier on the GC.
|
105
|
+
end
|
106
|
+
else
|
107
|
+
# caller wants whole doc as a traject source record
|
108
|
+
yield whole_input_doc
|
109
|
+
end
|
110
|
+
|
111
|
+
run_extra_xpath_hooks(whole_input_doc)
|
112
|
+
|
113
|
+
ensure
|
114
|
+
# hopefully make things easier on the GC.
|
115
|
+
whole_input_doc = nil
|
116
|
+
end
|
117
|
+
|
118
|
+
private
|
119
|
+
|
120
|
+
|
121
|
+
# In MRI Nokogiri, this is as simple as `new_parent_doc.root = node`
|
122
|
+
# It seemed maybe safer to dup the node as well as remove the original from the original doc,
|
123
|
+
# but I believe this will result in double memory usage, as unlinked nodes aren't GC'd until
|
124
|
+
# their doc is. I am hoping this pattern results in less memory usage.
|
125
|
+
# https://github.com/sparklemotion/nokogiri/issues/1703
|
126
|
+
#
|
127
|
+
# However, in JRuby it's a different story, JRuby doesn't properly preserve namespaces
|
128
|
+
# when re-parenting a node.
|
129
|
+
# https://github.com/sparklemotion/nokogiri/issues/1774
|
130
|
+
#
|
131
|
+
# The nodes within the tree re-parented _know_ they are in the correct namespaces,
|
132
|
+
# and xpath queries require that namespace, but the appropriate xmlns attributes
|
133
|
+
# aren't included in the serialized XML. This JRuby-specific code seems to get
|
134
|
+
# things back to a consistent state.
|
135
|
+
def reparent_node_to_root(new_parent_doc, node)
|
136
|
+
if Traject::Util.is_jruby?
|
137
|
+
original_ns_scopes = node.namespace_scopes
|
138
|
+
end
|
139
|
+
|
140
|
+
new_parent_doc.root = node
|
141
|
+
|
142
|
+
if Traject::Util.is_jruby?
|
143
|
+
original_ns_scopes.each do |ns|
|
144
|
+
if new_parent_doc.at_xpath("//#{ns.prefix}:*", ns.prefix => ns.href)
|
145
|
+
new_parent_doc.root.add_namespace(ns.prefix, ns.href)
|
146
|
+
end
|
147
|
+
end
|
148
|
+
end
|
149
|
+
|
150
|
+
return new_parent_doc
|
151
|
+
end
|
152
|
+
|
153
|
+
# Checks that every namespace prefix used in the given xpath is registered
# in `default_namespaces`, so a misconfigured xpath fails fast with a clear
# message instead of failing later during xpath evaluation.
#
# The xpath is split naively on '/' and each step on ':'; a step without a
# ':' is treated as having no namespace prefix and is skipped.
#
# @param xpath [String] the xpath expression to validate
# @param key_name [String] name of the setting the xpath came from, used
#   only to build the error message
# @raise [ArgumentError] if a step uses a prefix that is not a key in
#   `default_namespaces`
def validate_xpath(xpath, key_name:)
  # Validate the xpath we were actually given (not each_record_xpath), so
  # extra_xpath_hooks xpaths are checked too, and so this works when no
  # each_record_xpath is configured at all.
  xpath.split('/').each do |component|
    prefix, element = component.split(':')

    # there was no namespace on this step, nothing to look up
    next unless element

    next unless default_namespaces[prefix].nil?

    raise ArgumentError, "#{key_name}: Can't find namespace prefix '#{prefix}' in '#{xpath}'. To use a namespace in #{key_name}, it has to be registered with nokogiri.namespaces: #{default_namespaces.inspect}"
  end
end
|
170
|
+
|
171
|
+
def run_extra_xpath_hooks(noko_doc)
|
172
|
+
extra_xpath_hooks.each_pair do |xpath, callable|
|
173
|
+
noko_doc.xpath(xpath, default_namespaces).each do |matching_node|
|
174
|
+
callable.call(matching_node, clipboard)
|
175
|
+
end
|
176
|
+
end
|
177
|
+
end
|
178
|
+
end
|
179
|
+
end
|
@@ -0,0 +1,159 @@
|
|
1
|
+
require 'uri'
|
2
|
+
require 'cgi'
|
3
|
+
require 'http'
|
4
|
+
|
5
|
+
module Traject
|
6
|
+
# Reads an OAI feed via HTTP and feeds it directly to a traject pipeline. You don't HAVE to use
|
7
|
+
# this to read oai-pmh, you might choose to fetch and store OAI-PMH responses to disk yourself,
|
8
|
+
# and then process as ordinary XML.
|
9
|
+
#
|
10
|
+
# Example command line:
|
11
|
+
#
|
12
|
+
# traject -i xml -r Traject::OaiPmhNokogiriReader -s oai_pmh.start_url="http://example.com/oai?verb=ListRecords&metadataPrefix=oai_dc" -c your_config.rb
|
13
|
+
#
|
14
|
+
# ## Settings
|
15
|
+
#
|
16
|
+
# * oai_pmh.start_url: Required, eg "http://example.com/oai?verb=ListRecords&metadataPrefix=oai_dc"
|
17
|
+
# * oai_pmh.timeout: (default 10) timeout for http.rb in seconds
|
18
|
+
# * oai_pmh.try_gzip: (default true). Ask server for gzip response if available
|
19
|
+
# * oai_pmh.http_persistent: (default true). Use persistent HTTP connections.
|
20
|
+
#
|
21
|
+
# ## JRUBY NOTES:
|
22
|
+
# * Does not work with jruby 9.2 until http.rb does: https://github.com/httprb/http/issues/475
|
23
|
+
# * JRuby version definitely reads whole http response into memory before parsing; MRI version might do this too, but might not?
|
24
|
+
#
|
25
|
+
# ## TO DO
|
26
|
+
#
|
27
|
+
# This would be a lot more useful with some sort of built-in HTTP caching.
|
28
|
+
class OaiPmhNokogiriReader
|
29
|
+
include Enumerable
|
30
|
+
|
31
|
+
attr_reader :settings, :input_stream
|
32
|
+
|
33
|
+
def initialize(input_stream, settings)
|
34
|
+
namespaces = (settings["nokogiri.namespaces"] || {}).merge(
|
35
|
+
"oai" => "http://www.openarchives.org/OAI/2.0/"
|
36
|
+
)
|
37
|
+
|
38
|
+
|
39
|
+
@settings = Traject::Indexer::Settings.new(
|
40
|
+
"nokogiri_reader.extra_xpath_hooks" => extra_xpath_hooks,
|
41
|
+
"nokogiri.each_record_xpath" => "/oai:OAI-PMH/oai:ListRecords/oai:record",
|
42
|
+
"nokogiri.namespaces" => namespaces
|
43
|
+
).with_defaults(
|
44
|
+
"oai_pmh.timeout" => 10,
|
45
|
+
"oai_pmh.try_gzip" => true,
|
46
|
+
"oai_pmh.http_persistent" => true
|
47
|
+
).fill_in_defaults!.merge(settings)
|
48
|
+
|
49
|
+
@input_stream = input_stream
|
50
|
+
end
|
51
|
+
|
52
|
+
def start_url
|
53
|
+
settings["oai_pmh.start_url"] or raise ArgumentError.new("#{self.class.name} needs a setting 'oai_pmh.start_url'")
|
54
|
+
end
|
55
|
+
|
56
|
+
def start_url_verb
|
57
|
+
@start_url_verb ||= (array = CGI.parse(URI.parse(start_url).query)["verb"]) && array.first
|
58
|
+
end
|
59
|
+
|
60
|
+
def extra_xpath_hooks
|
61
|
+
@extra_xpath_hooks ||= {
|
62
|
+
"//oai:resumptionToken" =>
|
63
|
+
lambda do |doc, clipboard|
|
64
|
+
token = doc.text
|
65
|
+
if token && token != ""
|
66
|
+
clipboard[:resumption_token] = token
|
67
|
+
end
|
68
|
+
end
|
69
|
+
}
|
70
|
+
end
|
71
|
+
|
72
|
+
def each
|
73
|
+
url = start_url
|
74
|
+
|
75
|
+
resumption_token = nil
|
76
|
+
last_resumption_token = nil
|
77
|
+
pages_fetched = 0
|
78
|
+
|
79
|
+
until url == nil
|
80
|
+
resumption_token = read_and_parse_response(url) do |record|
|
81
|
+
yield record
|
82
|
+
end
|
83
|
+
url = resumption_url(resumption_token)
|
84
|
+
(last_resumption_token = resumption_token) if resumption_token
|
85
|
+
pages_fetched += 1
|
86
|
+
end
|
87
|
+
|
88
|
+
logger.info("#{self.class.name}: fetched #{pages_fetched} pages; last resumptionToken found: #{last_resumption_token.inspect}")
|
89
|
+
end
|
90
|
+
|
91
|
+
def resumption_url(resumption_token)
|
92
|
+
return nil if resumption_token.nil? || resumption_token == ""
|
93
|
+
|
94
|
+
# resumption URL is just original verb with resumption token, that seems to be
|
95
|
+
# the oai-pmh spec.
|
96
|
+
parsed_uri = URI.parse(start_url)
|
97
|
+
parsed_uri.query = "verb=#{CGI.escape start_url_verb}&resumptionToken=#{CGI.escape resumption_token}"
|
98
|
+
parsed_uri.to_s
|
99
|
+
end
|
100
|
+
|
101
|
+
def timeout
|
102
|
+
settings["oai_pmh.timeout"]
|
103
|
+
end
|
104
|
+
|
105
|
+
def logger
|
106
|
+
@logger ||= (@settings[:logger] || Yell.new(STDERR, :level => "gt.fatal")) # null logger)
|
107
|
+
end
|
108
|
+
|
109
|
+
private
|
110
|
+
|
111
|
+
# re-use an http-client for subsequent requests, to get http.rb's persistent connection re-use
|
112
|
+
# Note this means this is NOT thread safe, which is fine for now, but we'd have to do something
|
113
|
+
# different if we tried to multi-thread reading multiple files or something.
|
114
|
+
#
|
115
|
+
# @return [HTTP::Client] from http.rb gem
|
116
|
+
def http_client
|
117
|
+
@http_client ||= begin
|
118
|
+
# timeout setting on http.rb seems to be a mess.
|
119
|
+
# https://github.com/httprb/http/issues/488
|
120
|
+
client = HTTP.timeout(:global, write: timeout / 3, connect: timeout / 3, read: timeout / 3)
|
121
|
+
|
122
|
+
if settings["oai_pmh.try_gzip"]
|
123
|
+
client = client.use(:auto_inflate).headers("accept-encoding" => "gzip;q=1.0, identity;q=0.5")
|
124
|
+
end
|
125
|
+
|
126
|
+
if settings["oai_pmh.http_persistent"]
|
127
|
+
parsed_uri = URI.parse(start_url)
|
128
|
+
client = client.persistent("#{parsed_uri.scheme}://#{parsed_uri.host}")
|
129
|
+
end
|
130
|
+
|
131
|
+
client
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
def read_and_parse_response(url)
|
136
|
+
http_response = http_client.get(url)
|
137
|
+
|
138
|
+
#File.write("our_oai/#{Time.now.to_i}.xml", body)
|
139
|
+
|
140
|
+
# Not sure why JRuby Nokogiri requires us to call #to_s on it first;
|
141
|
+
# not sure if this has perf implications. In either case, not sure
|
142
|
+
# if we are reading a separate copy of response into memory, or if Noko
|
143
|
+
# consumes it streaming. Trying to explicitly stream it to nokogiri, using
|
144
|
+
# http.rb#readpartial, just gave us a big headache.
|
145
|
+
noko_source_arg = if Traject::Util.is_jruby?
|
146
|
+
http_response.body.to_s
|
147
|
+
else
|
148
|
+
http_response.body
|
149
|
+
end
|
150
|
+
|
151
|
+
reader = Traject::NokogiriReader.new(noko_source_arg, settings)
|
152
|
+
|
153
|
+
reader.each { |d| yield d }
|
154
|
+
|
155
|
+
return reader.clipboard[:resumption_token]
|
156
|
+
end
|
157
|
+
|
158
|
+
end
|
159
|
+
end
|