traject 2.3.4 → 3.0.0.alpha.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (69) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +16 -9
  3. data/CHANGES.md +74 -1
  4. data/Gemfile +2 -1
  5. data/README.md +104 -53
  6. data/Rakefile +8 -1
  7. data/doc/indexing_rules.md +79 -63
  8. data/doc/programmatic_use.md +218 -0
  9. data/doc/settings.md +28 -1
  10. data/doc/xml.md +134 -0
  11. data/lib/traject.rb +5 -0
  12. data/lib/traject/array_writer.rb +34 -0
  13. data/lib/traject/command_line.rb +18 -22
  14. data/lib/traject/debug_writer.rb +2 -5
  15. data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
  16. data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
  17. data/lib/traject/indexer.rb +321 -92
  18. data/lib/traject/indexer/context.rb +39 -13
  19. data/lib/traject/indexer/marc_indexer.rb +30 -0
  20. data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
  21. data/lib/traject/indexer/settings.rb +36 -53
  22. data/lib/traject/indexer/step.rb +27 -33
  23. data/lib/traject/macros/marc21.rb +37 -12
  24. data/lib/traject/macros/nokogiri_macros.rb +43 -0
  25. data/lib/traject/macros/transformation.rb +162 -0
  26. data/lib/traject/marc_extractor.rb +2 -0
  27. data/lib/traject/ndj_reader.rb +1 -1
  28. data/lib/traject/nokogiri_reader.rb +179 -0
  29. data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
  30. data/lib/traject/solr_json_writer.rb +19 -12
  31. data/lib/traject/thread_pool.rb +13 -0
  32. data/lib/traject/util.rb +14 -2
  33. data/lib/traject/version.rb +1 -1
  34. data/test/debug_writer_test.rb +3 -3
  35. data/test/delimited_writer_test.rb +3 -3
  36. data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
  37. data/test/indexer/context_test.rb +23 -13
  38. data/test/indexer/error_handler_test.rb +59 -0
  39. data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
  40. data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
  41. data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
  42. data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
  43. data/test/indexer/macros/to_field_test.rb +2 -2
  44. data/test/indexer/macros/transformation_test.rb +177 -0
  45. data/test/indexer/map_record_test.rb +2 -3
  46. data/test/indexer/nokogiri_indexer_test.rb +103 -0
  47. data/test/indexer/process_record_test.rb +55 -0
  48. data/test/indexer/process_with_test.rb +148 -0
  49. data/test/indexer/read_write_test.rb +52 -2
  50. data/test/indexer/settings_test.rb +34 -24
  51. data/test/indexer/to_field_test.rb +27 -2
  52. data/test/marc_extractor_test.rb +7 -7
  53. data/test/marc_reader_test.rb +4 -4
  54. data/test/nokogiri_reader_test.rb +158 -0
  55. data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
  56. data/test/solr_json_writer_test.rb +24 -28
  57. data/test/test_helper.rb +8 -2
  58. data/test/test_support/namespace-test.xml +7 -0
  59. data/test/test_support/nokogiri_demo_config.rb +17 -0
  60. data/test/test_support/oai-pmh-one-record-2.xml +24 -0
  61. data/test/test_support/oai-pmh-one-record-first.xml +24 -0
  62. data/test/test_support/sample-oai-no-namespace.xml +197 -0
  63. data/test/test_support/sample-oai-pmh.xml +197 -0
  64. data/test/thread_pool_test.rb +38 -0
  65. data/test/translation_map_test.rb +3 -3
  66. data/test/translation_maps/ruby_map.rb +2 -1
  67. data/test/translation_maps/yaml_map.yaml +2 -1
  68. data/traject.gemspec +4 -11
  69. metadata +92 -6
@@ -0,0 +1,43 @@
1
module Traject
  module Macros
    # Macros for pulling values out of Nokogiri XML source records. Intended to be
    # mixed into an object that responds to `settings`, and used as `to_field` steps.
    module NokogiriMacros

      # Namespace prefix => URI mappings read from the "nokogiri.namespaces"
      # setting (an empty Hash when the setting is absent). Memoized.
      #
      # @return [Hash]
      # @raise [ArgumentError] if the setting is present but is not a Hash
      def default_namespaces
        @default_namespaces ||= (settings["nokogiri.namespaces"] || {}).tap { |ns|
          unless ns.kind_of?(Hash)
            raise ArgumentError, "nokogiri.namespaces must be a hash, not: #{ns.inspect}"
          end
        }
      end

      # Builds a step that runs an xpath against the source record and appends
      # the matches to the accumulator.
      #
      # @param xpath [String] xpath expression, handed to `record.xpath`
      # @param ns [Hash] extra prefix => URI mappings, merged over
      #   #default_namespaces for this step only
      # @param to_text [Boolean] when true (default) every match is reduced to a
      #   String; when false the matched nodes themselves are accumulated
      # @return [Proc] lambda taking (record, accumulator)
      # @raise [ArgumentError] if "nokogiri.namespaces" is not a Hash
      def extract_xpath(xpath, ns: {}, to_text: true)
        if ns && ns.length > 0
          namespaces = default_namespaces.merge(ns)
        else
          namespaces = default_namespaces
        end

        lambda do |record, accumulator|
          result = record.xpath(xpath, namespaces)

          if to_text
            result = result.collect do |n|
              if n.kind_of?(Nokogiri::XML::Attr)
                # An attribute match (eg "//@id") has no descendant text nodes to
                # collect, so the text-node query below would give "" for it;
                # use the attribute's value instead.
                n.value
              else
                # Take all text content under the match and join it with spaces.
                # Text nodes that are entirely whitespace are the "between the
                # children" formatting whitespace, and are dropped.
                n.xpath('.//text()').collect(&:text).tap do |arr|
                  arr.reject! { |s| s =~ (/\A\s+\z/) }
                end.join(" ")
              end
            end
          else
            # just put all matches in accumulator as Nokogiri::XML::Node's
            result = result.to_a
          end

          accumulator.concat result
        end
      end
    end
  end
end
@@ -0,0 +1,162 @@
1
module Traject
  module Macros
    # Macros intended to be mixed into an Indexer and used in config
    # as second or further args to #to_field, to transform existing accumulator values.
    #
    # They have the same form as any proc/block passed to #to_field, but
    # operate on an existing accumulator, intended to be used as non-first-step
    # transformations.
    #
    # Some of these are extracted from extract_marc options, so they can be
    # used with any first-step extract methods. Some informed by current users.
    module Transformation

      # Maps all values on accumulator through a Traject::TranslationMap.
      #
      # A Traject::TranslationMap is hash-like mapping from input to output, usually
      # defined in a yaml or dot-properties file, which can be looked up in load path
      # with a file name as arg. See [Traject::TranslationMap](../translation_map.rb)
      # header comments for details.
      #
      # Using this macro, you can pass in one TranslationMap initializer arg, but you can
      # also pass in multiple, and they will be merged into each other (last one last), so
      # you can use this to apply over-rides: Either from another on-disk map, or even from
      # an inline hash (since a Hash is a valid TranslationMap initialization arg too).
      #
      # @example
      #     to_field "cataloging_agency", extract_marc("040a"), translation_map("marc_040a")
      #
      # @example with override
      #     to_field "cataloging_agency", extract_marc("040a"), translation_map("marc_040a", "local_marc_040a")
      #
      # @example with multiple overrides, including local hash
      #     to_field "cataloging_agency", extract_marc("040a"), translation_map("marc_040a", "local_marc_040a", {"DLC" => "U.S. LoC"})
      #
      # @param translation_map_specifier one or more TranslationMap initializer args
      # @return [Proc] lambda taking (record, accumulator)
      # @raise [ArgumentError] if no specifier is given
      def translation_map(*translation_map_specifier)
        # With no specifiers, reduce(:merge) would give nil and the step would only
        # blow up later, on the first record. Fail at config time instead.
        if translation_map_specifier.empty?
          raise ArgumentError, "Needs at least one translation map specifier"
        end

        translation_map = translation_map_specifier.
          collect { |spec| Traject::TranslationMap.new(spec) }.
          reduce(:merge)

        lambda do |rec, acc|
          translation_map.translate_array! acc
        end
      end

      # Pass in a proc/lambda arg or a block (or both), that will be called on each
      # value already in the accumulator, to transform it. (Ie, with `#map!`/`#collect!` on your proc(s)).
      #
      # Due to how ruby syntax precedence works, the block form is probably not too useful
      # in traject config files, except with the `&:` trick.
      #
      # The "stabby lambda" may be convenient for passing an explicit proc argument.
      #
      # You can pass both an explicit proc arg and a block, in which case the proc arg
      # will be applied first.
      #
      # @example
      #     to_field "something", extract_marc("something"), transform(&:upcase)
      #
      # @example
      #     to_field "something", extract_marc("something"), transform(->(val) { val.tr('^a-z', "\uFFFD") })
      #
      # @param a_proc [#call, nil] callable applied to each value
      # @return [Proc] lambda taking (record, accumulator)
      # @raise [ArgumentError] if neither a proc arg nor a block is given
      def transform(a_proc=nil, &block)
        unless a_proc || block
          raise ArgumentError, "Needs a transform proc arg or block arg"
        end

        transformer_callable = if a_proc && block
          # need to make a combo wrapper: proc arg first, then the block.
          ->(val) { block.call(a_proc.call(val)) }
        elsif a_proc
          a_proc
        else
          block
        end

        lambda do |rec, acc|
          acc.collect! do |value|
            transformer_callable.call(value)
          end
        end
      end

      # Adds a literal to accumulator if accumulator was empty
      #
      # @example
      #     to_field "title", extract_marc("245abc"), default("Unknown Title")
      def default(default_value)
        lambda do |rec, acc|
          if acc.empty?
            acc << default_value
          end
        end
      end

      # Removes all but the first value from accumulator, if more values were present.
      #
      # @example
      #     to_field "main_author", extract_marc("100"), first_only
      def first_only
        lambda do |rec, acc|
          # slice! deletes everything from index 1 onward, in place, so only
          # the first element (if any) is left in the accumulator.
          acc.slice!(1, acc.length)
        end
      end


      # calls ruby `uniq!` on accumulator, removes any duplicate values
      #
      # @example
      #     to_field "something", extract_marc("245:240"), unique
      def unique
        lambda do |rec, acc|
          acc.uniq!
        end
      end


      # For each value in accumulator, remove all leading or trailing whitespace
      # (unicode aware). Like ruby #strip, but aware of the unicode whitespace class.
      #
      # @example
      #     to_field "title", extract_marc("245"), strip
      def strip
        lambda do |rec, acc|
          acc.collect! do |v|
            # [[:space:]] is unicode whitespace class aware
            v.sub(/\A[[:space:]]+/,'').sub(/[[:space:]]+\Z/, '')
          end
        end
      end

      # Run ruby `split` on each value in the accumulator, with separator
      # given, flatten all results into single array as accumulator.
      # Will generally result in more individual values in accumulator as output than were
      # there in input, as input values are split up into multiple values.
      def split(separator)
        lambda do |rec, acc|
          acc.replace( acc.flat_map { |v| v.split(separator) } )
        end
      end

      # Append argument to end of each value in accumulator.
      def append(suffix)
        lambda do |rec, acc|
          acc.collect! { |v| v + suffix }
        end
      end

      # prepend argument to beginning of each value in accumulator.
      def prepend(prefix)
        lambda do |rec, acc|
          acc.collect! { |v| prefix + v }
        end
      end

      # Run ruby `gsub` on each value in accumulator, with pattern and replace value given.
      def gsub(pattern, replace)
        lambda do |rec, acc|
          acc.collect! { |v| v.gsub(pattern, replace) }
        end
      end
    end
  end
end
@@ -151,6 +151,8 @@ module Traject
151
151
  if options[:alternate_script] != false
152
152
  @fetch_alternate_script = true
153
153
  show_interest_in_tag(ALTERNATE_SCRIPT_TAG)
154
+ else
155
+ @fetch_alternate_script = false
154
156
  end
155
157
 
156
158
  @interesting_tags_list = @interesting_tags_hash.keys
@@ -12,7 +12,7 @@ class Traject::NDJReader
12
12
  def initialize(input_stream, settings)
13
13
  @settings = settings
14
14
  @input_stream = input_stream
15
- if /\.gz\Z/.match(@settings['command_line.filename'])
15
+ if input_stream.respond_to?(:path) && /\.gz\Z/.match(input_stream.path)
16
16
  @input_stream = Zlib::GzipReader.new(@input_stream, :external_encoding => "UTF-8")
17
17
  end
18
18
  end
@@ -0,0 +1,179 @@
1
module Traject
  # A Traject reader which reads XML, and yields zero to many Nokogiri::XML::Document
  # objects as source records in the traject pipeline.
  #
  # It does process the entire input document with Nokogiri::XML.parse, DOM-parsing,
  # so will take RAM for the entire input document, until iteration completes.
  # (There is a separate half-finished `ExperimentalStreamingNokogiriReader` available, but it is
  # experimental, half-finished, may disappear or change in backwards compat at any time, problematic,
  # not recommended for production use, etc.)
  #
  # You can have it yield the _entire_ input XML as a single traject source record
  # (default), or you can use setting `nokogiri.each_record_xpath` to split
  # the source up into separate records to yield into traject pipeline -- each one
  # will be its own Nokogiri::XML::Document.
  #
  # ## Settings
  # * nokogiri.namespaces: Set namespace prefixes that can be used in
  #   other settings, including `extract_xpath` from NokogiriMacros.
  # * nokogiri.each_record_xpath: if set to a string xpath, will take all matching nodes
  #   from the input doc, and yield them individually as source records to the pipeline.
  #   If you need to use namespaces here, you need to have them registered with
  #   `nokogiri.namespaces`. If your source docs use namespaces, you DO need
  #   to use them in your each_record_xpath.
  # * nokogiri_reader.extra_xpath_hooks: Experimental in progress, see below.
  #
  # ## nokogiri_reader.extra_xpath_hooks: For handling nodes outside of your each_record_xpath
  #
  # What if you want to use each_record_xpath to yield certain nodes as source documents, but
  # there is some additional info in other parts of the input document you need? This came up
  # when developing the OaiPmhNokogiriReader, which yields "//oai:record" as pipeline source documents,
  # but also needed to look at "//oai:resumptionToken" to scrape the entire results.
  #
  # There is a semi-finished/in-progress feature that meets that use case -- unclear if it will meet
  # all use cases for this general issue.
  #
  # Setting `nokogiri_reader.extra_xpath_hooks` can be set to a Hash where the keys are xpaths (if using
  # namespaces they must be registered with `nokogiri.namespaces`), and the value is a lambda/
  # proc/callable object, taking two arguments.
  #
  #     provide "nokogiri_reader.extra_xpath_hooks", {
  #       "//oai:resumptionToken" =>
  #         lambda do |node, clipboard|
  #           clipboard[:resumption_token] = node.text
  #         end
  #     }
  #
  # The first arg is the matching node. What's this clipboard? Well, what are you
  # gonna _do_ with what you get out of there, that you can do in a thread-safe way
  # in the middle of nokogiri processing? The second arg is a thread-safe Hash "clipboard"
  # that you can store things in, and later access via reader.clipboard.
  #
  # There's no great thread-safe way to get reader.clipboard in a normal nokogiri pipeline though,
  # (the reader can change in multi-file handling so there can be a race condition if you try naively,
  # don't!) Which is why this feature needs some work for general applicability. The OaiPmhReader
  # manually creates its readers outside the usual nokogiri flow, so can use it.
  class NokogiriReader
    include Enumerable

    attr_reader :settings, :input_stream, :clipboard, :path_tracker

    # @param input_stream source handed to Nokogiri::XML.parse when iterating
    # @param settings hash-like settings, wrapped in Traject::Indexer::Settings
    # @raise [ArgumentError] if nokogiri.namespaces is not a Hash, or if
    #   each_record_xpath / an extra_xpath_hooks key uses an unregistered prefix
    def initialize(input_stream, settings)
      @settings = Traject::Indexer::Settings.new settings
      @input_stream = input_stream
      @clipboard = Traject::Util.is_jruby? ? Concurrent::Map.new : Concurrent::Hash.new

      default_namespaces # trigger validation
      validate_xpath(each_record_xpath, key_name: "each_record_xpath") if each_record_xpath
      extra_xpath_hooks.each_pair do |xpath, _callable|
        validate_xpath(xpath, key_name: "extra_xpath_hooks")
      end
    end

    # Value of the "nokogiri.each_record_xpath" setting; nil when unset.
    def each_record_xpath
      @each_record_xpath ||= settings["nokogiri.each_record_xpath"]
    end

    # Value of the "nokogiri_reader.extra_xpath_hooks" setting, or an empty Hash.
    def extra_xpath_hooks
      @extra_xpath_hooks ||= settings["nokogiri_reader.extra_xpath_hooks"] || {}
    end

    # Namespace prefix => URI mappings from the "nokogiri.namespaces" setting
    # (empty Hash when unset). Raises ArgumentError if the setting is not a Hash.
    def default_namespaces
      @default_namespaces ||= (settings["nokogiri.namespaces"] || {}).tap { |ns|
        unless ns.kind_of?(Hash)
          raise ArgumentError, "nokogiri.namespaces must be a hash, not: #{ns.inspect}"
        end
      }
    end

    # Parses the whole input, yields one Nokogiri::XML::Document per source
    # record, then runs the extra xpath hooks against the parsed input document.
    def each
      whole_input_doc = Nokogiri::XML.parse(input_stream)

      if each_record_xpath
        whole_input_doc.xpath(each_record_xpath, default_namespaces).each do |matching_node|
          # We want to take the matching node, and make it into root in a new Nokogiri document.
          # This is tricky to do as performant as possible (we want to re-use the existing libxml node),
          # while preserving namespaces properly (especially in jruby). Some uses of noko api that seem
          # like they should work don't, esp in jruby.
          child_doc = Nokogiri::XML::Document.new

          reparent_node_to_root(child_doc, matching_node)

          yield child_doc

          child_doc = nil # hopefully make things easier on the GC.
        end
      else
        # caller wants whole doc as a traject source record
        yield whole_input_doc
      end

      run_extra_xpath_hooks(whole_input_doc)

    ensure
      # hopefully make things easier on the GC.
      whole_input_doc = nil
    end

    private


    # In MRI Nokogiri, this is as simple as `new_parent_doc.root = node`
    # It seemed maybe safer to dup the node as well as remove the original from the original doc,
    # but I believe this will result in double memory usage, as unlinked nodes aren't GC'd until
    # their doc is. I am hoping this pattern results in less memory usage.
    # https://github.com/sparklemotion/nokogiri/issues/1703
    #
    # However, in JRuby it's a different story, JRuby doesn't properly preserve namespaces
    # when re-parenting a node.
    # https://github.com/sparklemotion/nokogiri/issues/1774
    #
    # The nodes within the tree re-parented _know_ they are in the correct namespaces,
    # and xpath queries require that namespace, but the appropriate xmlns attributes
    # aren't included in the serialized XML. This JRuby-specific code seems to get
    # things back to a consistent state.
    def reparent_node_to_root(new_parent_doc, node)
      if Traject::Util.is_jruby?
        original_ns_scopes = node.namespace_scopes
      end

      new_parent_doc.root = node

      if Traject::Util.is_jruby?
        original_ns_scopes.each do |ns|
          if new_parent_doc.at_xpath("//#{ns.prefix}:*", ns.prefix => ns.href)
            new_parent_doc.root.add_namespace(ns.prefix, ns.href)
          end
        end
      end

      return new_parent_doc
    end

    # Checks that every namespace prefix used in `xpath` is registered in
    # #default_namespaces.
    #
    # The check is a plain string split on "/" and ":", so a ":" appearing anywhere
    # else in a path step (eg an axis such as `ancestor::x`) is also read as a prefix.
    #
    # @param xpath [String] the xpath to check
    # @param key_name [String] name of the setting the xpath came from, used in the error message
    # @raise [ArgumentError] if a prefix is not registered
    def validate_xpath(xpath, key_name:)
      xpath.split('/').each do |component|
        prefix, element = component.split(':')

        # no colon in this path step means there was no namespace prefix
        next unless element

        if default_namespaces[prefix].nil?
          raise ArgumentError, "#{key_name}: Can't find namespace prefix '#{prefix}' in '#{xpath}'. To use a namespace in #{key_name}, it has to be registered with nokogiri.namespaces: #{default_namespaces.inspect}"
        end
      end
    end

    # Calls each extra_xpath_hooks callable with (matching_node, clipboard) for
    # every node in noko_doc matching that hook's xpath.
    def run_extra_xpath_hooks(noko_doc)
      extra_xpath_hooks.each_pair do |xpath, callable|
        noko_doc.xpath(xpath, default_namespaces).each do |matching_node|
          callable.call(matching_node, clipboard)
        end
      end
    end
  end
end
@@ -0,0 +1,159 @@
1
+ require 'uri'
2
+ require 'cgi'
3
+ require 'http'
4
+
5
+ module Traject
6
+ # Reads an OAI feed via HTTP and feeds it directly to a traject pipeline. You don't HAVE to use
7
+ # this to read oai-pmh, you might choose to fetch and store OAI-PMH responses to disk yourself,
8
+ # and then process as ordinary XML.
9
+ #
10
+ # Example command line:
11
+ #
12
+ # traject -i xml -r Traject::OaiPmhNokogiriReader -s oai_pmh.start_url="http://example.com/oai?verb=ListRecords&metadataPrefix=oai_dc" -c your_config.rb
13
+ #
14
+ # ## Settings
15
+ #
16
+ # * oai_pmh.start_url: Required, eg "http://example.com/oai?verb=ListRecords&metadataPrefix=oai_dc"
17
+ # * oai_pmh.timeout: (default 10) timeout for http.rb in seconds
18
+ # * oai_pmh.try_gzip: (default true). Ask server for gzip response if available
19
+ # * oai_pmh.http_persistent: (default true). Use persistent HTTP connections.
20
+ #
21
+ # ## JRUBY NOTES:
22
+ # * Does not work with jruby 9.2 until http.rb does: https://github.com/httprb/http/issues/475
23
+ # * JRuby version def reads whole http response into memory before parsing; MRI version might do this too, but might not?
24
+ #
25
+ # ## TO DO
26
+ #
27
+ # This would be a lot more useful with some sort of built-in HTTP caching.
28
+ class OaiPmhNokogiriReader
29
+ include Enumerable
30
+
31
+ attr_reader :settings, :input_stream
32
+
33
+ def initialize(input_stream, settings)
34
+ namespaces = (settings["nokogiri.namespaces"] || {}).merge(
35
+ "oai" => "http://www.openarchives.org/OAI/2.0/"
36
+ )
37
+
38
+
39
+ @settings = Traject::Indexer::Settings.new(
40
+ "nokogiri_reader.extra_xpath_hooks" => extra_xpath_hooks,
41
+ "nokogiri.each_record_xpath" => "/oai:OAI-PMH/oai:ListRecords/oai:record",
42
+ "nokogiri.namespaces" => namespaces
43
+ ).with_defaults(
44
+ "oai_pmh.timeout" => 10,
45
+ "oai_pmh.try_gzip" => true,
46
+ "oai_pmh.http_persistent" => true
47
+ ).fill_in_defaults!.merge(settings)
48
+
49
+ @input_stream = input_stream
50
+ end
51
+
52
+ def start_url
53
+ settings["oai_pmh.start_url"] or raise ArgumentError.new("#{self.class.name} needs a setting 'oai_pmh.start_url'")
54
+ end
55
+
56
+ def start_url_verb
57
+ @start_url_verb ||= (array = CGI.parse(URI.parse(start_url).query)["verb"]) && array.first
58
+ end
59
+
60
+ def extra_xpath_hooks
61
+ @extra_xpath_hooks ||= {
62
+ "//oai:resumptionToken" =>
63
+ lambda do |doc, clipboard|
64
+ token = doc.text
65
+ if token && token != ""
66
+ clipboard[:resumption_token] = token
67
+ end
68
+ end
69
+ }
70
+ end
71
+
72
+ def each
73
+ url = start_url
74
+
75
+ resumption_token = nil
76
+ last_resumption_token = nil
77
+ pages_fetched = 0
78
+
79
+ until url == nil
80
+ resumption_token = read_and_parse_response(url) do |record|
81
+ yield record
82
+ end
83
+ url = resumption_url(resumption_token)
84
+ (last_resumption_token = resumption_token) if resumption_token
85
+ pages_fetched += 1
86
+ end
87
+
88
+ logger.info("#{self.class.name}: fetched #{pages_fetched} pages; last resumptionToken found: #{last_resumption_token.inspect}")
89
+ end
90
+
91
+ def resumption_url(resumption_token)
92
+ return nil if resumption_token.nil? || resumption_token == ""
93
+
94
+ # resumption URL is just original verb with resumption token, that seems to be
95
+ # the oai-pmh spec.
96
+ parsed_uri = URI.parse(start_url)
97
+ parsed_uri.query = "verb=#{CGI.escape start_url_verb}&resumptionToken=#{CGI.escape resumption_token}"
98
+ parsed_uri.to_s
99
+ end
100
+
101
+ def timeout
102
+ settings["oai_pmh.timeout"]
103
+ end
104
+
105
+ def logger
106
+ @logger ||= (@settings[:logger] || Yell.new(STDERR, :level => "gt.fatal")) # null logger)
107
+ end
108
+
109
+ private
110
+
111
+ # re-use an http-client for subsequent requests, to get http.rb's persistent connection re-use
112
+ # Note this means this is NOT thread safe, which is fine for now, but we'd have to do something
113
+ # different if we tried to multi-thread reading multiple files or something.
114
+ #
115
+ # @returns [HTTP::Client] from http.rb gem
116
+ def http_client
117
+ @http_client ||= begin
118
+ # timeout setting on http.rb seems to be a mess.
119
+ # https://github.com/httprb/http/issues/488
120
+ client = HTTP.timeout(:global, write: timeout / 3, connect: timeout / 3, read: timeout / 3)
121
+
122
+ if settings["oai_pmh.try_gzip"]
123
+ client = client.use(:auto_inflate).headers("accept-encoding" => "gzip;q=1.0, identity;q=0.5")
124
+ end
125
+
126
+ if settings["oai_pmh.http_persistent"]
127
+ parsed_uri = URI.parse(start_url)
128
+ client = client.persistent("#{parsed_uri.scheme}://#{parsed_uri.host}")
129
+ end
130
+
131
+ client
132
+ end
133
+ end
134
+
135
+ def read_and_parse_response(url)
136
+ http_response = http_client.get(url)
137
+
138
+ #File.write("our_oai/#{Time.now.to_i}.xml", body)
139
+
140
+ # Not sure why JRuby Nokogiri requires us to call #to_s on it first;
141
+ # not sure if this has perf implications. In either case, not sure
142
+ # if we are reading a separate copy of response into memory, or if Noko
143
+ # consumes it streaming. Trying to explicitly stream it to nokogiri, using
144
+ # http.rb#readpartial, just gave us a big headache.
145
+ noko_source_arg = if Traject::Util.is_jruby?
146
+ http_response.body.to_s
147
+ else
148
+ http_response.body
149
+ end
150
+
151
+ reader = Traject::NokogiriReader.new(noko_source_arg, settings)
152
+
153
+ reader.each { |d| yield d }
154
+
155
+ return reader.clipboard[:resumption_token]
156
+ end
157
+
158
+ end
159
+ end