traject 2.3.4 → 3.0.0.alpha.1
- checksums.yaml +5 -5
- data/.travis.yml +16 -9
- data/CHANGES.md +74 -1
- data/Gemfile +2 -1
- data/README.md +104 -53
- data/Rakefile +8 -1
- data/doc/indexing_rules.md +79 -63
- data/doc/programmatic_use.md +218 -0
- data/doc/settings.md +28 -1
- data/doc/xml.md +134 -0
- data/lib/traject.rb +5 -0
- data/lib/traject/array_writer.rb +34 -0
- data/lib/traject/command_line.rb +18 -22
- data/lib/traject/debug_writer.rb +2 -5
- data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
- data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
- data/lib/traject/indexer.rb +321 -92
- data/lib/traject/indexer/context.rb +39 -13
- data/lib/traject/indexer/marc_indexer.rb +30 -0
- data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
- data/lib/traject/indexer/settings.rb +36 -53
- data/lib/traject/indexer/step.rb +27 -33
- data/lib/traject/macros/marc21.rb +37 -12
- data/lib/traject/macros/nokogiri_macros.rb +43 -0
- data/lib/traject/macros/transformation.rb +162 -0
- data/lib/traject/marc_extractor.rb +2 -0
- data/lib/traject/ndj_reader.rb +1 -1
- data/lib/traject/nokogiri_reader.rb +179 -0
- data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
- data/lib/traject/solr_json_writer.rb +19 -12
- data/lib/traject/thread_pool.rb +13 -0
- data/lib/traject/util.rb +14 -2
- data/lib/traject/version.rb +1 -1
- data/test/debug_writer_test.rb +3 -3
- data/test/delimited_writer_test.rb +3 -3
- data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
- data/test/indexer/context_test.rb +23 -13
- data/test/indexer/error_handler_test.rb +59 -0
- data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
- data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
- data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
- data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
- data/test/indexer/macros/to_field_test.rb +2 -2
- data/test/indexer/macros/transformation_test.rb +177 -0
- data/test/indexer/map_record_test.rb +2 -3
- data/test/indexer/nokogiri_indexer_test.rb +103 -0
- data/test/indexer/process_record_test.rb +55 -0
- data/test/indexer/process_with_test.rb +148 -0
- data/test/indexer/read_write_test.rb +52 -2
- data/test/indexer/settings_test.rb +34 -24
- data/test/indexer/to_field_test.rb +27 -2
- data/test/marc_extractor_test.rb +7 -7
- data/test/marc_reader_test.rb +4 -4
- data/test/nokogiri_reader_test.rb +158 -0
- data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
- data/test/solr_json_writer_test.rb +24 -28
- data/test/test_helper.rb +8 -2
- data/test/test_support/namespace-test.xml +7 -0
- data/test/test_support/nokogiri_demo_config.rb +17 -0
- data/test/test_support/oai-pmh-one-record-2.xml +24 -0
- data/test/test_support/oai-pmh-one-record-first.xml +24 -0
- data/test/test_support/sample-oai-no-namespace.xml +197 -0
- data/test/test_support/sample-oai-pmh.xml +197 -0
- data/test/thread_pool_test.rb +38 -0
- data/test/translation_map_test.rb +3 -3
- data/test/translation_maps/ruby_map.rb +2 -1
- data/test/translation_maps/yaml_map.yaml +2 -1
- data/traject.gemspec +4 -11
- metadata +92 -6
data/lib/traject/macros/nokogiri_macros.rb
ADDED
@@ -0,0 +1,43 @@
+module Traject
+  module Macros
+    module NokogiriMacros
+
+      def default_namespaces
+        @default_namespaces ||= (settings["nokogiri.namespaces"] || {}).tap { |ns|
+          unless ns.kind_of?(Hash)
+            raise ArgumentError, "nokogiri.namespaces must be a hash, not: #{ns.inspect}"
+          end
+        }
+      end
+
+      def extract_xpath(xpath, ns: {}, to_text: true)
+        if ns && ns.length > 0
+          namespaces = default_namespaces.merge(ns)
+        else
+          namespaces = default_namespaces
+        end
+
+        lambda do |record, accumulator|
+          result = record.xpath(xpath, namespaces)
+
+          if to_text
+            # take all matches, for each match take all
+            # text content, join it together separated with spaces
+            # Make sure to avoid text content that was all blank, which is "between the children"
+            # whitespace.
+            result = result.collect do |n|
+              n.xpath('.//text()').collect(&:text).tap do |arr|
+                arr.reject! { |s| s =~ (/\A\s+\z/) }
+              end.join(" ")
+            end
+          else
+            # just put all matches in accumulator as Nokogiri::XML::Node's
+            result = result.to_a
+          end
+
+          accumulator.concat result
+        end
+      end
+    end
+  end
+end
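To show how the new `extract_xpath` macro is meant to be used from a traject configuration, here is a minimal, hypothetical sketch. The field names and the `dc`/`ex` namespace mappings are illustrative assumptions, not part of this diff; the `nokogiri.namespaces` setting and the `ns:`/`to_text:` options are those defined above.

```ruby
# Hypothetical config sketch -- field names and namespace mappings are assumptions.
provide "nokogiri.namespaces", {
  "dc" => "http://purl.org/dc/elements/1.1/"
}

# The default to_text: true joins the text content of all matching nodes, space-separated.
to_field "title", extract_xpath("//dc:title")

# Per-call ns: namespaces merge over nokogiri.namespaces; to_text: false leaves
# Nokogiri::XML::Node objects in the accumulator instead of strings.
to_field "identifier_nodes",
  extract_xpath("//ex:identifier", ns: { "ex" => "http://example.org/ns" }, to_text: false)
```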
data/lib/traject/macros/transformation.rb
ADDED
@@ -0,0 +1,162 @@
+module Traject
+  module Macros
+    # Macros intended to be mixed into an Indexer and used in config
+    # as second or further args to #to_field, to transform existing accumulator values.
+    #
+    # They have the same form as any proc/block passed to #to_field, but
+    # operate on an existing accumulator, intended to be used as non-first-step
+    # transformations.
+    #
+    # Some of these are extracted from extract_marc options, so they can be
+    # used with any first-step extract methods. Some informed by current users.
+    module Transformation
+
+      # Maps all values on accumulator through a Traject::TranslationMap.
+      #
+      # A Traject::TranslationMap is hash-like mapping from input to output, usually
+      # defined in a yaml or dot-properties file, which can be looked up in load path
+      # with a file name as arg. See [Traject::TranslationMap](../translation_map.rb)
+      # header comments for details.
+      #
+      # Using this macro, you can pass in one TranslationMap initializer arg, but you can
+      # also pass in multiple, and they will be merged into each other (last one last), so
+      # you can use this to apply over-rides: Either from another on-disk map, or even from
+      # an inline hash (since a Hash is a valid TranslationMap initialization arg too).
+      #
+      # @example
+      #     to_field "cataloging_agency", extract_marc("040a"), translation_map("marc_040a")
+      #
+      # @example with override
+      #     to_field "cataloging_agency", extract_marc("040a"), translation_map("marc_040a", "local_marc_040a")
+      #
+      # @example with multiple overrides, including local hash
+      #     to_field "cataloging_agency", extract_marc("040a"), translation_map("marc_040a", "local_marc_040a", {"DLC" => "U.S. LoC"})
+      def translation_map(*translation_map_specifier)
+        translation_map = translation_map_specifier.
+          collect { |spec| Traject::TranslationMap.new(spec) }.
+          reduce(:merge)
+
+        lambda do |rec, acc|
+          translation_map.translate_array! acc
+        end
+      end
+
+      # Pass in a proc/lambda arg or a block (or both), that will be called on each
+      # value already in the accumulator, to transform it. (Ie, with `#map!`/`#collect!` on your proc(s)).
+      #
+      # Due to how ruby syntax precedence works, the block form is probably not too useful
+      # in traject config files, except with the `&:` trick.
+      #
+      # The "stabby lambda" may be convenient for passing an explicit proc argument.
+      #
+      # You can pass both an explicit proc arg and a block, in which case the proc arg
+      # will be applied first.
+      #
+      # @example
+      #     to_field "something", extract_marc("something"), transform(&:upcase)
+      #
+      # @example
+      #     to_field "something", extract_marc("something"), transform(->(val) { val.tr('^a-z', "\uFFFD") })
+      def transform(a_proc=nil, &block)
+        unless a_proc || block
+          raise ArgumentError, "Needs a transform proc arg or block arg"
+        end
+
+        transformer_callable = if a_proc && block
+          # need to make a combo wrapper.
+          ->(val) { block.call(a_proc.call(val)) }
+        elsif a_proc
+          a_proc
+        else
+          block
+        end
+
+        lambda do |rec, acc|
+          acc.collect! do |value|
+            transformer_callable.call(value)
+          end
+        end
+      end
+
+      # Adds a literal to accumulator if accumulator was empty
+      #
+      # @example
+      #     to_field "title", extract_marc("245abc"), default("Unknown Title")
+      def default(default_value)
+        lambda do |rec, acc|
+          if acc.empty?
+            acc << default_value
+          end
+        end
+      end
+
+      # Removes all but the first value from accumulator, if more values were present.
+      #
+      # @example
+      #     to_field "main_author", extract_marc("100"), first_only
+      def first_only
+        lambda do |rec, acc|
+          # kind of esoteric, but slice used this way does mutating first, yep
+          acc.slice!(1, acc.length)
+        end
+      end
+
+
+      # calls ruby `uniq!` on accumulator, removes any duplicate values
+      #
+      # @example
+      #     to_field "something", extract_marc("245:240"), unique
+      def unique
+        lambda do |rec, acc|
+          acc.uniq!
+        end
+      end
+
+
+      # For each value in accumulator, remove all leading or trailing whitespace
+      # (unicode aware). Like ruby #strip, but unicode whitespace aware.
+      #
+      # @example
+      #     to_field "title", extract_marc("245"), strip
+      def strip
+        lambda do |rec, acc|
+          acc.collect! do |v|
+            # unicode whitespace class aware
+            v.sub(/\A[[:space:]]+/,'').sub(/[[:space:]]+\Z/, '')
+          end
+        end
+      end
+
+      # Run ruby `split` on each value in the accumulator, with separator
+      # given, flatten all results into single array as accumulator.
+      # Will generally result in more individual values in accumulator as output than were
+      # there in input, as input values are split up into multiple values.
+      def split(separator)
+        lambda do |rec, acc|
+          acc.replace( acc.flat_map { |v| v.split(separator) } )
+        end
+      end
+
+      # Append argument to end of each value in accumulator.
+      def append(suffix)
+        lambda do |rec, acc|
+          acc.collect! { |v| v + suffix }
+        end
+      end
+
+      # prepend argument to beginning of each value in accumulator.
+      def prepend(prefix)
+        lambda do |rec, acc|
+          acc.collect! { |v| prefix + v }
+        end
+      end
+
+      # Run ruby `gsub` on each value in accumulator, with pattern and replace value given.
+      def gsub(pattern, replace)
+        lambda do |rec, acc|
+          acc.collect! { |v| v.gsub(pattern, replace) }
+        end
+      end
+    end
+  end
+end
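For orientation, here is a hedged sketch of how these transformation macros chain after a first-step extractor in a traject config. The MARC tags, field names, and translation map names are illustrative assumptions; the macros themselves are the ones defined above.

```ruby
# Illustrative only -- tags, field names, and map names are assumptions.
to_field "cataloging_agency", extract_marc("040a"), translation_map("marc_040a"), default("Unknown"), first_only

# strip stray whitespace, trim trailing punctuation, de-duplicate repeated fields
to_field "title", extract_marc("245abc"), strip, gsub(/ *[\/:;,]\z/, ""), unique

# split a packed value into several, then normalize each resulting value
to_field "keywords", extract_marc("653a"), split(";"), strip, transform(->(v) { v.downcase }), prepend("kw: ")
```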
data/lib/traject/marc_extractor.rb
CHANGED
@@ -151,6 +151,8 @@ module Traject
       if options[:alternate_script] != false
         @fetch_alternate_script = true
         show_interest_in_tag(ALTERNATE_SCRIPT_TAG)
+      else
+        @fetch_alternate_script = false
       end
 
       @interesting_tags_list = @interesting_tags_hash.keys
data/lib/traject/ndj_reader.rb
CHANGED
@@ -12,7 +12,7 @@ class Traject::NDJReader
   def initialize(input_stream, settings)
     @settings = settings
     @input_stream = input_stream
-    if /\.gz\Z/.match(input_stream.path)
+    if input_stream.respond_to?(:path) && /\.gz\Z/.match(input_stream.path)
       @input_stream = Zlib::GzipReader.new(@input_stream, :external_encoding => "UTF-8")
     end
   end
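The added guard matters because not every IO handed to the reader responds to #path (a pipe or StringIO, for instance). A rough sketch of the two cases, with the file name and record content assumed for illustration:

```ruby
require 'stringio'

# A File opened from disk responds to #path, so gzip input is still detected by extension.
Traject::NDJReader.new(File.open("records.ndj.gz"), {})

# A StringIO (or $stdin) has no #path; previously this raised NoMethodError,
# now it is simply read as plain newline-delimited JSON.
Traject::NDJReader.new(StringIO.new('{"leader":"00000nam a2200000 a 4500", "fields":[]}'), {})
```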
data/lib/traject/nokogiri_reader.rb
ADDED
@@ -0,0 +1,179 @@
+module Traject
+  # A Traject reader which reads XML, and yields zero to many Nokogiri::XML::Document
+  # objects as source records in the traject pipeline.
+  #
+  # It does process the entire input document with Nokogiri::XML.parse, DOM-parsing,
+  # so will take RAM for the entire input document, until iteration completes.
+  # (There is a separate half-finished `ExperimentalStreamingNokogiriReader` available, but it is
+  # experimental, half-finished, may disappear or change in backwards compat at any time, problematic,
+  # not recommended for production use, etc.)
+  #
+  # You can have it yield the _entire_ input XML as a single traject source record
+  # (default), or you can use setting `nokogiri.each_record_xpath` to split
+  # the source up into separate records to yield into traject pipeline -- each one
+  # will be its own Nokogiri::XML::Document.
+  #
+  # ## Settings
+  # * nokogiri.namespaces: Set namespace prefixes that can be used in
+  #   other settings, including `extract_xpath` from NokogiriMacros.
+  # * nokogiri.each_record_xpath: if set to a string xpath, will take all matching nodes
+  #   from the input doc, and yield them individually as source records to the pipeline.
+  #   If you need to use namespaces here, you need to have them registered with
+  #   `nokogiri.namespaces`. If your source docs use namespaces, you DO need
+  #   to use them in your each_record_xpath.
+  # * nokogiri_reader.extra_xpath_hooks: Experimental in progress, see below.
+  #
+  # ## nokogiri_reader.extra_xpath_hooks: For handling nodes outside of your each_record_xpath
+  #
+  # What if you want to use each_record_xpath to yield certain nodes as source documents, but
+  # there is some additional info in other parts of the input document you need? This came up
+  # when developing the OaiPmhNokogiriReader, which yields "//oai:record" as pipeline source documents,
+  # but also needed to look at "//oai:resumptionToken" to scrape the entire result set.
+  #
+  # There is a semi-finished/in-progress feature that meets that use case -- unclear if it will meet
+  # all use cases for this general issue.
+  #
+  # Setting `nokogiri_reader.extra_xpath_hooks` can be set to a Hash where the keys are xpaths (if using
+  # namespaces, they must be registered with `nokogiri.namespaces`), and the value is a lambda/
+  # proc/callable object, taking two arguments.
+  #
+  #     provide "nokogiri_reader.extra_xpath_hooks", {
+  #       "//oai:resumptionToken" =>
+  #         lambda do |node, clipboard|
+  #           clipboard[:resumption_token] = node.text
+  #         end
+  #     }
+  #
+  # The first arg is the matching node. What's this clipboard? Well, what are you
+  # gonna _do_ with what you get out of there, that you can do in a thread-safe way
+  # in the middle of nokogiri processing? The second arg is a thread-safe Hash "clipboard"
+  # that you can store things in, and later access via reader.clipboard.
+  #
+  # There's no great thread-safe way to get reader.clipboard in a normal nokogiri pipeline though
+  # (the reader can change in multi-file handling so there can be a race condition if you try naively,
+  # don't!), which is why this feature needs some work for general applicability. The OaiPmhReader
+  # manually creates its readers outside the usual nokogiri flow, so can use it.
+  class NokogiriReader
+    include Enumerable
+
+    attr_reader :settings, :input_stream, :clipboard, :path_tracker
+
+    def initialize(input_stream, settings)
+      @settings = Traject::Indexer::Settings.new settings
+      @input_stream = input_stream
+      @clipboard = Traject::Util.is_jruby? ? Concurrent::Map.new : Concurrent::Hash.new
+
+      default_namespaces # trigger validation
+      validate_xpath(each_record_xpath, key_name: "each_record_xpath") if each_record_xpath
+      extra_xpath_hooks.each_pair do |xpath, _callable|
+        validate_xpath(xpath, key_name: "extra_xpath_hooks")
+      end
+    end
+
+    def each_record_xpath
+      @each_record_xpath ||= settings["nokogiri.each_record_xpath"]
+    end
+
+    def extra_xpath_hooks
+      @extra_xpath_hooks ||= settings["nokogiri_reader.extra_xpath_hooks"] || {}
+    end
+
+    def default_namespaces
+      @default_namespaces ||= (settings["nokogiri.namespaces"] || {}).tap { |ns|
+        unless ns.kind_of?(Hash)
+          raise ArgumentError, "nokogiri.namespaces must be a hash, not: #{ns.inspect}"
+        end
+      }
+    end
+
+    def each
+      whole_input_doc = Nokogiri::XML.parse(input_stream)
+
+      if each_record_xpath
+        whole_input_doc.xpath(each_record_xpath, default_namespaces).each do |matching_node|
+          # We want to take the matching node, and make it into root in a new Nokogiri document.
+          # This is tricky to do as performant as possible (we want to re-use the existing libxml node),
+          # while preserving namespaces properly (especially in jruby). Some uses of noko api that seem
+          # like they should work don't, esp in jruby.
+          child_doc = Nokogiri::XML::Document.new
+
+          reparent_node_to_root(child_doc, matching_node)
+
+          yield child_doc
+
+          child_doc = nil # hopefully make things easier on the GC.
+        end
+      else
+        # caller wants whole doc as a traject source record
+        yield whole_input_doc
+      end
+
+      run_extra_xpath_hooks(whole_input_doc)
+
+    ensure
+      # hopefully make things easier on the GC.
+      whole_input_doc = nil
+    end
+
+    private
+
+
+    # In MRI Nokogiri, this is as simple as `new_parent_doc.root = node`
+    # It seemed maybe safer to dup the node as well as remove the original from the original doc,
+    # but I believe this will result in double memory usage, as unlinked nodes aren't GC'd until
+    # their doc is. I am hoping this pattern results in less memory usage.
+    # https://github.com/sparklemotion/nokogiri/issues/1703
+    #
+    # However, in JRuby it's a different story, JRuby doesn't properly preserve namespaces
+    # when re-parenting a node.
+    # https://github.com/sparklemotion/nokogiri/issues/1774
+    #
+    # The nodes within the tree re-parented _know_ they are in the correct namespaces,
+    # and xpath queries require that namespace, but the appropriate xmlns attributes
+    # aren't included in the serialized XML. This JRuby-specific code seems to get
+    # things back to a consistent state.
+    def reparent_node_to_root(new_parent_doc, node)
+      if Traject::Util.is_jruby?
+        original_ns_scopes = node.namespace_scopes
+      end
+
+      new_parent_doc.root = node
+
+      if Traject::Util.is_jruby?
+        original_ns_scopes.each do |ns|
+          if new_parent_doc.at_xpath("//#{ns.prefix}:*", ns.prefix => ns.href)
+            new_parent_doc.root.add_namespace(ns.prefix, ns.href)
+          end
+        end
+      end
+
+      return new_parent_doc
+    end
+
+    def validate_xpath(xpath, key_name:)
+      components = each_record_xpath.split('/')
+      components.each do |component|
+        prefix, element = component.split(':')
+        unless element
+          # there was no namespace
+          prefix, element = nil, prefix
+        end
+
+        if prefix
+          ns_uri = default_namespaces[prefix]
+          if ns_uri.nil?
+            raise ArgumentError, "#{key_name}: Can't find namespace prefix '#{prefix}' in '#{each_record_xpath}'. To use a namespace in each_record_xpath, it has to be registered with nokogiri.namespaces: #{default_namespaces.inspect}"
+          end
+        end
+      end
+    end
+
+    def run_extra_xpath_hooks(noko_doc)
+      extra_xpath_hooks.each_pair do |xpath, callable|
+        noko_doc.xpath(xpath, default_namespaces).each do |matching_node|
+          callable.call(matching_node, clipboard)
+        end
+      end
+    end
+  end
+end
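To tie the settings documented above together, here is a hypothetical configuration for splitting an OAI-PMH-shaped XML file into records with this DOM-parsing reader. The namespace URIs are the standard OAI-PMH and Dublin Core ones; the field name and the hook usage are assumptions for illustration, built from the settings and macros in this diff.

```ruby
# Hypothetical traject config; settings are the ones documented in NokogiriReader above.
provide "reader_class_name", "Traject::NokogiriReader"
provide "nokogiri.namespaces", {
  "oai" => "http://www.openarchives.org/OAI/2.0/",
  "dc"  => "http://purl.org/dc/elements/1.1/"
}
provide "nokogiri.each_record_xpath", "//oai:record"

# Capture a node that lives outside each_record_xpath into the reader's thread-safe clipboard.
provide "nokogiri_reader.extra_xpath_hooks", {
  "//oai:resumptionToken" => lambda do |node, clipboard|
    clipboard[:resumption_token] = node.text
  end
}

to_field "title", extract_xpath("//dc:title")
```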
data/lib/traject/oai_pmh_nokogiri_reader.rb
ADDED
@@ -0,0 +1,159 @@
+require 'uri'
+require 'cgi'
+require 'http'
+
+module Traject
+  # Reads an OAI feed via HTTP and feeds it directly to a traject pipeline. You don't HAVE to use
+  # this to read oai-pmh, you might choose to fetch and store OAI-PMH responses to disk yourself,
+  # and then process as ordinary XML.
+  #
+  # Example command line:
+  #
+  #     traject -i xml -r Traject::OaiPmhNokogiriReader -s oai_pmh.start_url="http://example.com/oai?verb=ListRecords&metadataPrefix=oai_dc" -c your_config.rb
+  #
+  # ## Settings
+  #
+  # * oai_pmh.start_url: Required, eg "http://example.com/oai?verb=ListRecords&metadataPrefix=oai_dc"
+  # * oai_pmh.timeout: (default 10) timeout for http.rb in seconds
+  # * oai_pmh.try_gzip: (default true). Ask server for gzip response if available
+  # * oai_pmh.http_persistent: (default true). Use persistent HTTP connections.
+  #
+  # ## JRUBY NOTES:
+  # * Does not work with jruby 9.2 until http.rb does: https://github.com/httprb/http/issues/475
+  # * JRuby version definitely reads whole http response into memory before parsing; MRI version might do this too, but might not.
+  #
+  # ## TO DO
+  #
+  # This would be a lot more useful with some sort of built-in HTTP caching.
+  class OaiPmhNokogiriReader
+    include Enumerable
+
+    attr_reader :settings, :input_stream
+
+    def initialize(input_stream, settings)
+      namespaces = (settings["nokogiri.namespaces"] || {}).merge(
+        "oai" => "http://www.openarchives.org/OAI/2.0/"
+      )
+
+
+      @settings = Traject::Indexer::Settings.new(
+        "nokogiri_reader.extra_xpath_hooks" => extra_xpath_hooks,
+        "nokogiri.each_record_xpath" => "/oai:OAI-PMH/oai:ListRecords/oai:record",
+        "nokogiri.namespaces" => namespaces
+      ).with_defaults(
+        "oai_pmh.timeout" => 10,
+        "oai_pmh.try_gzip" => true,
+        "oai_pmh.http_persistent" => true
+      ).fill_in_defaults!.merge(settings)
+
+      @input_stream = input_stream
+    end
+
+    def start_url
+      settings["oai_pmh.start_url"] or raise ArgumentError.new("#{self.class.name} needs a setting 'oai_pmh.start_url'")
+    end
+
+    def start_url_verb
+      @start_url_verb ||= (array = CGI.parse(URI.parse(start_url).query)["verb"]) && array.first
+    end
+
+    def extra_xpath_hooks
+      @extra_xpath_hooks ||= {
+        "//oai:resumptionToken" =>
+          lambda do |doc, clipboard|
+            token = doc.text
+            if token && token != ""
+              clipboard[:resumption_token] = token
+            end
+          end
+      }
+    end
+
+    def each
+      url = start_url
+
+      resumption_token = nil
+      last_resumption_token = nil
+      pages_fetched = 0
+
+      until url == nil
+        resumption_token = read_and_parse_response(url) do |record|
+          yield record
+        end
+        url = resumption_url(resumption_token)
+        (last_resumption_token = resumption_token) if resumption_token
+        pages_fetched += 1
+      end
+
+      logger.info("#{self.class.name}: fetched #{pages_fetched} pages; last resumptionToken found: #{last_resumption_token.inspect}")
+    end
+
+    def resumption_url(resumption_token)
+      return nil if resumption_token.nil? || resumption_token == ""
+
+      # resumption URL is just original verb with resumption token, that seems to be
+      # the oai-pmh spec.
+      parsed_uri = URI.parse(start_url)
+      parsed_uri.query = "verb=#{CGI.escape start_url_verb}&resumptionToken=#{CGI.escape resumption_token}"
+      parsed_uri.to_s
+    end
+
+    def timeout
+      settings["oai_pmh.timeout"]
+    end
+
+    def logger
+      @logger ||= (@settings[:logger] || Yell.new(STDERR, :level => "gt.fatal")) # null logger
+    end
+
+    private
+
+    # re-use an http-client for subsequent requests, to get http.rb's persistent connection re-use
+    # Note this means this is NOT thread safe, which is fine for now, but we'd have to do something
+    # different if we tried to multi-thread reading multiple files or something.
+    #
+    # @return [HTTP::Client] from http.rb gem
+    def http_client
+      @http_client ||= begin
+        # timeout setting on http.rb seems to be a mess.
+        # https://github.com/httprb/http/issues/488
+        client = HTTP.timeout(:global, write: timeout / 3, connect: timeout / 3, read: timeout / 3)
+
+        if settings["oai_pmh.try_gzip"]
+          client = client.use(:auto_inflate).headers("accept-encoding" => "gzip;q=1.0, identity;q=0.5")
+        end
+
+        if settings["oai_pmh.http_persistent"]
+          parsed_uri = URI.parse(start_url)
+          client = client.persistent("#{parsed_uri.scheme}://#{parsed_uri.host}")
+        end
+
+        client
+      end
+    end
+
+    def read_and_parse_response(url)
+      http_response = http_client.get(url)
+
+      #File.write("our_oai/#{Time.now.to_i}.xml", body)
+
+      # Not sure why JRuby Nokogiri requires us to call #to_s on it first;
+      # not sure if this has perf implications. In either case, not sure
+      # if we are reading a separate copy of response into memory, or if Noko
+      # consumes it streaming. Trying to explicitly stream it to nokogiri, using
+      # http.rb#readpartial, just gave us a big headache.
+      noko_source_arg = if Traject::Util.is_jruby?
+        http_response.body.to_s
+      else
+        http_response.body
+      end
+
+      reader = Traject::NokogiriReader.new(noko_source_arg, settings)
+
+      reader.each { |d| yield d }
+
+      return reader.clipboard[:resumption_token]
+    end
+
+  end
+end
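Finally, a sketch of the `your_config.rb` referenced in the command-line example in the class comment above. The Dublin Core namespace mapping and field names are assumptions for oai_dc metadata; the macros used are the `extract_xpath` and transformation macros introduced elsewhere in this release.

```ruby
# Hypothetical your_config.rb for:
#   traject -i xml -r Traject::OaiPmhNokogiriReader \
#     -s oai_pmh.start_url="http://example.com/oai?verb=ListRecords&metadataPrefix=oai_dc" \
#     -c your_config.rb
provide "nokogiri.namespaces", {
  "dc" => "http://purl.org/dc/elements/1.1/"
}

to_field "title",   extract_xpath("//dc:title"), strip
to_field "creator", extract_xpath("//dc:creator"), unique
to_field "subject", extract_xpath("//dc:subject"), unique
```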