traject 2.3.4 → 3.0.0.alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +16 -9
  3. data/CHANGES.md +74 -1
  4. data/Gemfile +2 -1
  5. data/README.md +104 -53
  6. data/Rakefile +8 -1
  7. data/doc/indexing_rules.md +79 -63
  8. data/doc/programmatic_use.md +218 -0
  9. data/doc/settings.md +28 -1
  10. data/doc/xml.md +134 -0
  11. data/lib/traject.rb +5 -0
  12. data/lib/traject/array_writer.rb +34 -0
  13. data/lib/traject/command_line.rb +18 -22
  14. data/lib/traject/debug_writer.rb +2 -5
  15. data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
  16. data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
  17. data/lib/traject/indexer.rb +321 -92
  18. data/lib/traject/indexer/context.rb +39 -13
  19. data/lib/traject/indexer/marc_indexer.rb +30 -0
  20. data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
  21. data/lib/traject/indexer/settings.rb +36 -53
  22. data/lib/traject/indexer/step.rb +27 -33
  23. data/lib/traject/macros/marc21.rb +37 -12
  24. data/lib/traject/macros/nokogiri_macros.rb +43 -0
  25. data/lib/traject/macros/transformation.rb +162 -0
  26. data/lib/traject/marc_extractor.rb +2 -0
  27. data/lib/traject/ndj_reader.rb +1 -1
  28. data/lib/traject/nokogiri_reader.rb +179 -0
  29. data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
  30. data/lib/traject/solr_json_writer.rb +19 -12
  31. data/lib/traject/thread_pool.rb +13 -0
  32. data/lib/traject/util.rb +14 -2
  33. data/lib/traject/version.rb +1 -1
  34. data/test/debug_writer_test.rb +3 -3
  35. data/test/delimited_writer_test.rb +3 -3
  36. data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
  37. data/test/indexer/context_test.rb +23 -13
  38. data/test/indexer/error_handler_test.rb +59 -0
  39. data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
  40. data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
  41. data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
  42. data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
  43. data/test/indexer/macros/to_field_test.rb +2 -2
  44. data/test/indexer/macros/transformation_test.rb +177 -0
  45. data/test/indexer/map_record_test.rb +2 -3
  46. data/test/indexer/nokogiri_indexer_test.rb +103 -0
  47. data/test/indexer/process_record_test.rb +55 -0
  48. data/test/indexer/process_with_test.rb +148 -0
  49. data/test/indexer/read_write_test.rb +52 -2
  50. data/test/indexer/settings_test.rb +34 -24
  51. data/test/indexer/to_field_test.rb +27 -2
  52. data/test/marc_extractor_test.rb +7 -7
  53. data/test/marc_reader_test.rb +4 -4
  54. data/test/nokogiri_reader_test.rb +158 -0
  55. data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
  56. data/test/solr_json_writer_test.rb +24 -28
  57. data/test/test_helper.rb +8 -2
  58. data/test/test_support/namespace-test.xml +7 -0
  59. data/test/test_support/nokogiri_demo_config.rb +17 -0
  60. data/test/test_support/oai-pmh-one-record-2.xml +24 -0
  61. data/test/test_support/oai-pmh-one-record-first.xml +24 -0
  62. data/test/test_support/sample-oai-no-namespace.xml +197 -0
  63. data/test/test_support/sample-oai-pmh.xml +197 -0
  64. data/test/thread_pool_test.rb +38 -0
  65. data/test/translation_map_test.rb +3 -3
  66. data/test/translation_maps/ruby_map.rb +2 -1
  67. data/test/translation_maps/yaml_map.yaml +2 -1
  68. data/traject.gemspec +4 -11
  69. metadata +92 -6
@@ -0,0 +1,43 @@
1
module Traject
  module Macros
    # Macros for extracting values from XML source records via XPath.
    #
    # The including object must respond to `settings` (a hash-like object);
    # source records passed to the generated lambdas must respond to
    # `xpath(expression, namespaces)`.
    module NokogiriMacros

      # Namespace prefix => URI mappings read from the "nokogiri.namespaces"
      # setting, or an empty Hash when the setting is absent. The value is
      # memoized after the first successful call.
      #
      # @return [Hash]
      # @raise [ArgumentError] if the setting holds something other than a Hash
      def default_namespaces
        return @default_namespaces if @default_namespaces

        configured = settings["nokogiri.namespaces"] || {}
        unless configured.kind_of?(Hash)
          raise ArgumentError, "nokogiri.namespaces must be a hash, not: #{configured.inspect}"
        end

        @default_namespaces = configured
      end

      # Builds an indexing step that evaluates `xpath` against the record and
      # appends the matches to the accumulator.
      #
      # @param xpath [String] XPath expression evaluated on each record
      # @param ns [Hash] extra prefix => URI mappings, merged over
      #   #default_namespaces for this step only
      # @param to_text [Boolean] when true (default) each match is reduced to
      #   its text content; when false the matched nodes themselves are added
      # @return [Proc] lambda taking (record, accumulator)
      def extract_xpath(xpath, ns: {}, to_text: true)
        namespaces = default_namespaces
        namespaces = namespaces.merge(ns) if ns && !ns.length.zero?

        lambda do |record, accumulator|
          matches = record.xpath(xpath, namespaces)

          values =
            if to_text
              # One string per match: all descendant text nodes joined with a
              # single space, skipping whitespace-only nodes (the "between
              # the children" indentation).
              matches.collect do |node|
                pieces = node.xpath('.//text()').collect(&:text)
                pieces.reject { |s| s =~ (/\A\s+\z/) }.join(" ")
              end
            else
              # Hand the matched nodes over untouched.
              matches.to_a
            end

          accumulator.concat values
        end
      end
    end
  end
end
@@ -0,0 +1,162 @@
1
module Traject
  module Macros
    # Macros intended to be mixed into an Indexer and used in config
    # as second or further args to #to_field, to transform existing accumulator values.
    #
    # They have the same form as any proc/block passed to #to_field, but
    # operate on an existing accumulator, intended to be used as non-first-step
    # transformations.
    #
    # Some of these are extracted from extract_marc options, so they can be
    # used with any first-step extract methods. Some informed by current users.
    #
    # Every macro returns a lambda taking (record, accumulator) that mutates
    # the accumulator array in place; the record argument is ignored.
    module Transformation

      # Maps all values on accumulator through a Traject::TranslationMap.
      #
      # A Traject::TranslationMap is hash-like mapping from input to output, usually
      # defined in a yaml or dot-properties file, which can be looked up in load path
      # with a file name as arg. See [Traject::TranslationMap](../translation_map.rb)
      # header comments for details.
      #
      # Using this macro, you can pass in one TranslationMap initializer arg, but you can
      # also pass in multiple, and they will be combined left to right with `#merge`,
      # so you can use this to apply over-rides: Either from another on-disk map, or even from
      # an inline hash (since a Hash is a valid TranslationMap initialization arg too).
      #
      # @example
      #     to_field "cataloging_agency", extract_marc("040a"), translation_map("marc_040a")
      #
      # @example with override
      #     to_field "cataloging_agency", extract_marc("040a"), translation_map("marc_040a", "local_marc_040a")
      #
      # @example with multiple overrides, including local hash
      #     to_field "cataloging_agency", extract_marc("040a"), translation_map("marc_040a", "local_marc_040a", {"DLC" => "U.S. LoC"})
      #
      # @param translation_map_specifier [Array] one or more TranslationMap initializer args
      # @return [Proc] lambda taking (record, accumulator)
      # @raise [ArgumentError] if no specifier is given
      def translation_map(*translation_map_specifier)
        # Without this guard reduce(:merge) yields nil and the step would only
        # blow up later, with a NoMethodError, while records are being processed.
        if translation_map_specifier.empty?
          raise ArgumentError, "Needs at least one translation map specifier"
        end

        translation_map = translation_map_specifier.
          collect { |spec| Traject::TranslationMap.new(spec) }.
          reduce(:merge)

        lambda do |_rec, acc|
          translation_map.translate_array! acc
        end
      end

      # Pass in a proc/lambda arg or a block (or both), that will be called on each
      # value already in the accumulator, to transform it. (Ie, with `#map!`/`#collect!` on your proc(s)).
      #
      # Due to how ruby syntax precedence works, the block form is probably not too useful
      # in traject config files, except with the `&:` trick.
      #
      # The "stabby lambda" may be convenient for passing an explicit proc argument.
      #
      # You can pass both an explicit proc arg and a block, in which case the proc arg
      # will be applied first.
      #
      # @example
      #     to_field "something", extract_marc("something"), transform(&:upcase)
      #
      # @example
      #     to_field "something", extract_marc("something"), transform(->(val) { val.tr('^a-z', "\uFFFD") })
      #
      # @param a_proc [#call, nil] callable applied to each value
      # @return [Proc] lambda taking (record, accumulator)
      # @raise [ArgumentError] if neither a proc arg nor a block is given
      def transform(a_proc=nil, &block)
        unless a_proc || block
          raise ArgumentError, "Needs a transform proc arg or block arg"
        end

        transformer_callable = if a_proc && block
          # need to make a combo wrapper: proc arg first, then the block.
          ->(val) { block.call(a_proc.call(val)) }
        elsif a_proc
          a_proc
        else
          block
        end

        lambda do |_rec, acc|
          acc.collect! do |value|
            transformer_callable.call(value)
          end
        end
      end

      # Adds a literal to accumulator if accumulator was empty
      #
      # @example
      #     to_field "title", extract_marc("245abc"), default("Unknown Title")
      #
      # @param default_value [Object] value appended when nothing was extracted
      # @return [Proc] lambda taking (record, accumulator)
      def default(default_value)
        lambda do |_rec, acc|
          if acc.empty?
            acc << default_value
          end
        end
      end

      # Removes all but the first value from accumulator, if more values were present.
      #
      # @example
      #     to_field "main_author", extract_marc("100"), first_only
      #
      # @return [Proc] lambda taking (record, accumulator)
      def first_only
        lambda do |_rec, acc|
          # slice! removes everything from index 1 onward in place, leaving
          # only the first element (or nothing, for an empty accumulator).
          acc.slice!(1, acc.length)
        end
      end


      # calls ruby `uniq!` on accumulator, removes any duplicate values
      #
      # @example
      #     to_field "something", extract_marc("245:240"), unique
      #
      # @return [Proc] lambda taking (record, accumulator)
      def unique
        lambda do |_rec, acc|
          acc.uniq!
        end
      end


      # For each value in accumulator, remove all leading or trailing whitespace
      # (unicode aware). Like ruby #strip, but matching the `[[:space:]]`
      # character class rather than only ASCII whitespace.
      #
      # @example
      #     to_field "title", extract_marc("245"), strip
      #
      # @return [Proc] lambda taking (record, accumulator)
      def strip
        lambda do |_rec, acc|
          acc.collect! do |v|
            # unicode whitespace class aware
            v.sub(/\A[[:space:]]+/,'').sub(/[[:space:]]+\z/, '')
          end
        end
      end

      # Run ruby `split` on each value in the accumulator, with separator
      # given, flatten all results into single array as accumulator.
      # Will generally result in more individual values in accumulator as output than were
      # there in input, as input values are split up into multiple values.
      #
      # @param separator [String, Regexp] passed straight to String#split
      # @return [Proc] lambda taking (record, accumulator)
      def split(separator)
        lambda do |_rec, acc|
          acc.replace( acc.flat_map { |v| v.split(separator) } )
        end
      end

      # Append argument to end of each value in accumulator.
      #
      # @param suffix [String]
      # @return [Proc] lambda taking (record, accumulator)
      def append(suffix)
        lambda do |_rec, acc|
          acc.collect! { |v| v + suffix }
        end
      end

      # prepend argument to beginning of each value in accumulator.
      #
      # @param prefix [String]
      # @return [Proc] lambda taking (record, accumulator)
      def prepend(prefix)
        lambda do |_rec, acc|
          acc.collect! { |v| prefix + v }
        end
      end

      # Run ruby `gsub` on each value in accumulator, with pattern and replace value given.
      #
      # @param pattern [String, Regexp]
      # @param replace [String, Hash]
      # @return [Proc] lambda taking (record, accumulator)
      def gsub(pattern, replace)
        lambda do |_rec, acc|
          acc.collect! { |v| v.gsub(pattern, replace) }
        end
      end
    end
  end
end
@@ -151,6 +151,8 @@ module Traject
151
151
  if options[:alternate_script] != false
152
152
  @fetch_alternate_script = true
153
153
  show_interest_in_tag(ALTERNATE_SCRIPT_TAG)
154
+ else
155
+ @fetch_alternate_script = false
154
156
  end
155
157
 
156
158
  @interesting_tags_list = @interesting_tags_hash.keys
@@ -12,7 +12,7 @@ class Traject::NDJReader
12
12
  def initialize(input_stream, settings)
13
13
  @settings = settings
14
14
  @input_stream = input_stream
15
- if /\.gz\Z/.match(@settings['command_line.filename'])
15
+ if input_stream.respond_to?(:path) && /\.gz\Z/.match(input_stream.path)
16
16
  @input_stream = Zlib::GzipReader.new(@input_stream, :external_encoding => "UTF-8")
17
17
  end
18
18
  end
@@ -0,0 +1,179 @@
1
module Traject
  # A Traject reader which reads XML, and yields zero to many Nokogiri::XML::Document
  # objects as source records in the traject pipeline.
  #
  # It does process the entire input document with Nokogiri::XML.parse, DOM-parsing,
  # so will take RAM for the entire input document, until iteration completes.
  # (There is a separate experimental streaming reader, see
  # experimental_nokogiri_streaming_reader.rb, but it is experimental, half-finished,
  # may disappear or change in backwards compat at any time, problematic,
  # not recommended for production use, etc.)
  #
  # You can have it yield the _entire_ input XML as a single traject source record
  # (default), or you can use setting `nokogiri.each_record_xpath` to split
  # the source up into separate records to yield into traject pipeline -- each one
  # will be its own Nokogiri::XML::Document.
  #
  # ## Settings
  # * nokogiri.namespaces: Hash of namespace prefix => URI. Sets namespace prefixes
  #   that can be used in the other settings below.
  # * nokogiri.each_record_xpath: if set to a string xpath, will take all matching nodes
  #   from the input doc, and yield them individually as source records to the pipeline.
  #   If you need to use namespaces here, you need to have them registered with
  #   `nokogiri.namespaces`. If your source docs use namespaces, you DO need
  #   to use them in your each_record_xpath.
  # * nokogiri_reader.extra_xpath_hooks: Experimental in progress, see below.
  #
  # ## nokogiri_reader.extra_xpath_hooks: For handling nodes outside of your each_record_xpath
  #
  # What if you want to use each_record_xpath to yield certain nodes as source documents, but
  # there is some additional info in other parts of the input document you need? This came up
  # when developing the OaiPmhNokogiriReader, which yields "//oai:record" as pipeline source documents,
  # but also needed to look at "//oai:resumptionToken" to scrape the entire results.
  #
  # There is a semi-finished/in-progress feature that meets that use case -- unclear if it will meet
  # all use cases for this general issue.
  #
  # Setting `nokogiri_reader.extra_xpath_hooks` can be set to a Hash where the keys are xpaths (if using
  # namespaces they must be registered with `nokogiri.namespaces`), and the value is a lambda/
  # proc/callable object, taking two arguments.
  #
  #     provide "nokogiri_reader.extra_xpath_hooks", {
  #       "//oai:resumptionToken" =>
  #         lambda do |node, clipboard|
  #           clipboard[:resumption_token] = node.text
  #         end
  #     }
  #
  # The first arg is the matching node. What's this clipboard? Well, what are you
  # gonna _do_ with what you get out of there, that you can do in a thread-safe way
  # in the middle of nokogiri processing? The second arg is a thread-safe Hash "clipboard"
  # that you can store things in, and later access via reader.clipboard.
  #
  # The hooks are run once per #each call, after all records have been yielded.
  #
  # There's no great thread-safe way to get reader.clipboard in a normal nokogiri pipeline though,
  # (the reader can change in multi-file handling so there can be a race condition if you try naively,
  # don't!) Which is why this feature needs some work for general applicability. The OaiPmhReader
  # manually creates its readers outside the usual nokogiri flow, so can use it.
  class NokogiriReader
    include Enumerable

    attr_reader :settings, :input_stream, :clipboard, :path_tracker

    # @param input_stream [IO, String] XML source handed to Nokogiri::XML.parse
    # @param settings [Hash] traject settings, wrapped in Traject::Indexer::Settings
    # @raise [ArgumentError] if nokogiri.namespaces is not a Hash, or if
    #   each_record_xpath or any extra_xpath_hooks key uses an unregistered
    #   namespace prefix
    def initialize(input_stream, settings)
      @settings = Traject::Indexer::Settings.new settings
      @input_stream = input_stream
      @clipboard = Traject::Util.is_jruby? ? Concurrent::Map.new : Concurrent::Hash.new

      default_namespaces # trigger validation
      validate_xpath(each_record_xpath, key_name: "each_record_xpath") if each_record_xpath
      extra_xpath_hooks.each_pair do |xpath, _callable|
        validate_xpath(xpath, key_name: "extra_xpath_hooks")
      end
    end

    # @return [String, nil] value of the nokogiri.each_record_xpath setting
    def each_record_xpath
      @each_record_xpath ||= settings["nokogiri.each_record_xpath"]
    end

    # @return [Hash] xpath => callable, from nokogiri_reader.extra_xpath_hooks
    #   (empty Hash when unset)
    def extra_xpath_hooks
      @extra_xpath_hooks ||= settings["nokogiri_reader.extra_xpath_hooks"] || {}
    end

    # @return [Hash] prefix => URI, from the nokogiri.namespaces setting
    #   (empty Hash when unset)
    # @raise [ArgumentError] if the setting is not a Hash
    def default_namespaces
      @default_namespaces ||= (settings["nokogiri.namespaces"] || {}).tap { |ns|
        unless ns.kind_of?(Hash)
          raise ArgumentError, "nokogiri.namespaces must be a hash, not: #{ns.inspect}"
        end
      }
    end

    # Parses the whole input, yields each source record, then runs the
    # extra xpath hooks against the parsed input document.
    #
    # @yieldparam record [Nokogiri::XML::Document] the whole input document, or
    #   one new document per node matching each_record_xpath
    # @return [Enumerator] when called without a block
    def each
      return enum_for(:each) unless block_given?

      whole_input_doc = Nokogiri::XML.parse(input_stream)

      if each_record_xpath
        whole_input_doc.xpath(each_record_xpath, default_namespaces).each do |matching_node|
          # We want to take the matching node, and make it into root in a new Nokogiri document.
          # This is tricky to do as performant as possible (we want to re-use the existing libxml node),
          # while preserving namespaces properly (especially in jruby). Some uses of noko api that seem
          # like they should work don't, esp in jruby.
          child_doc = Nokogiri::XML::Document.new

          reparent_node_to_root(child_doc, matching_node)

          yield child_doc

          child_doc = nil # hopefully make things easier on the GC.
        end
      else
        # caller wants whole doc as a traject source record
        yield whole_input_doc
      end

      run_extra_xpath_hooks(whole_input_doc)

    ensure
      # hopefully make things easier on the GC.
      whole_input_doc = nil
    end

    private


    # In MRI Nokogiri, this is as simple as `new_parent_doc.root = node`
    # It seemed maybe safer to dup the node as well as remove the original from the original doc,
    # but I believe this will result in double memory usage, as unlinked nodes aren't GC'd until
    # their doc is. I am hoping this pattern results in less memory usage.
    # https://github.com/sparklemotion/nokogiri/issues/1703
    #
    # However, in JRuby it's a different story, JRuby doesn't properly preserve namespaces
    # when re-parenting a node.
    # https://github.com/sparklemotion/nokogiri/issues/1774
    #
    # The nodes within the tree re-parented _know_ they are in the correct namespaces,
    # and xpath queries require that namespace, but the appropriate xmlns attributes
    # aren't included in the serialized XML. This JRuby-specific code seems to get
    # things back to a consistent state.
    #
    # @return [Nokogiri::XML::Document] new_parent_doc, now rooted at node
    def reparent_node_to_root(new_parent_doc, node)
      if Traject::Util.is_jruby?
        original_ns_scopes = node.namespace_scopes
      end

      new_parent_doc.root = node

      if Traject::Util.is_jruby?
        original_ns_scopes.each do |ns|
          # only re-declare namespaces actually used somewhere in the moved tree
          if new_parent_doc.at_xpath("//#{ns.prefix}:*", ns.prefix => ns.href)
            new_parent_doc.root.add_namespace(ns.prefix, ns.href)
          end
        end
      end

      return new_parent_doc
    end

    # Checks that every namespace prefix used in `xpath` is registered in
    # #default_namespaces, so a misconfiguration fails at construction time.
    #
    # This is a lightweight textual scan, not an XPath parser: quoted string
    # literals are ignored, and a `name:` only counts as a prefix when it is
    # followed by a name or `*` (so axis specifiers like `ancestor::` are not
    # mistaken for prefixes).
    #
    # @param xpath [String] the expression to check
    # @param key_name [String] name of the setting, used in the error message
    # @raise [ArgumentError] on the first unregistered prefix
    def validate_xpath(xpath, key_name:)
      without_literals = xpath.gsub(/"[^"]*"|'[^']*'/, "")
      prefixes = without_literals.scan(/(?<![\w:.-])([A-Za-z_][\w.-]*):(?=[A-Za-z_*])/).flatten.uniq

      prefixes.each do |prefix|
        next unless default_namespaces[prefix].nil?

        raise ArgumentError, "#{key_name}: Can't find namespace prefix '#{prefix}' in '#{xpath}'. To use a namespace in #{key_name}, it has to be registered with nokogiri.namespaces: #{default_namespaces.inspect}"
      end
    end

    # Calls each configured hook with every node matching its xpath, passing
    # the shared clipboard as second argument.
    def run_extra_xpath_hooks(noko_doc)
      extra_xpath_hooks.each_pair do |xpath, callable|
        noko_doc.xpath(xpath, default_namespaces).each do |matching_node|
          callable.call(matching_node, clipboard)
        end
      end
    end
  end
end
@@ -0,0 +1,159 @@
1
+ require 'uri'
2
+ require 'cgi'
3
+ require 'http'
4
+
5
+ module Traject
6
+ # Reads an OAI feed via HTTP and feeds it directly to a traject pipeline. You don't HAVE to use
7
+ # this to read oai-pmh, you might choose to fetch and store OAI-PMH responses to disk yourself,
8
+ # and then process as ordinary XML.
9
+ #
10
+ # Example command line:
11
+ #
12
+ # traject -i xml -r Traject::OaiPmhNokogiriReader -s oai_pmh.start_url="http://example.com/oai?verb=ListRecords&metadataPrefix=oai_dc" -c your_config.rb
13
+ #
14
+ # ## Settings
15
+ #
16
+ # * oai_pmh.start_url: Required, eg "http://example.com/oai?verb=ListRecords&metadataPrefix=oai_dc"
17
+ # * oai_pmh.timeout: (default 10) timeout for http.rb in seconds
18
+ # * oai_pmh.try_gzip: (default true). Ask server for gzip response if available
19
+ # * oai_pmh.http_persistent: (default true). Use persistent HTTP connections.
20
+ #
21
+ # ## JRUBY NOTES:
22
+ # * Does not work with jruby 9.2 until http.rb does: https://github.com/httprb/http/issues/475
23
+ # * JRuby version def reads whole http response into memory before parsing; MRI version might do this too, but might not?
24
+ #
25
+ # ## TO DO
26
+ #
27
+ # This would be a lot more useful with some sort of built-in HTTP caching.
28
+ class OaiPmhNokogiriReader
29
+ include Enumerable
30
+
31
+ attr_reader :settings, :input_stream
32
+
33
+ def initialize(input_stream, settings)
34
+ namespaces = (settings["nokogiri.namespaces"] || {}).merge(
35
+ "oai" => "http://www.openarchives.org/OAI/2.0/"
36
+ )
37
+
38
+
39
+ @settings = Traject::Indexer::Settings.new(
40
+ "nokogiri_reader.extra_xpath_hooks" => extra_xpath_hooks,
41
+ "nokogiri.each_record_xpath" => "/oai:OAI-PMH/oai:ListRecords/oai:record",
42
+ "nokogiri.namespaces" => namespaces
43
+ ).with_defaults(
44
+ "oai_pmh.timeout" => 10,
45
+ "oai_pmh.try_gzip" => true,
46
+ "oai_pmh.http_persistent" => true
47
+ ).fill_in_defaults!.merge(settings)
48
+
49
+ @input_stream = input_stream
50
+ end
51
+
52
+ def start_url
53
+ settings["oai_pmh.start_url"] or raise ArgumentError.new("#{self.class.name} needs a setting 'oai_pmh.start_url'")
54
+ end
55
+
56
+ def start_url_verb
57
+ @start_url_verb ||= (array = CGI.parse(URI.parse(start_url).query)["verb"]) && array.first
58
+ end
59
+
60
+ def extra_xpath_hooks
61
+ @extra_xpath_hooks ||= {
62
+ "//oai:resumptionToken" =>
63
+ lambda do |doc, clipboard|
64
+ token = doc.text
65
+ if token && token != ""
66
+ clipboard[:resumption_token] = token
67
+ end
68
+ end
69
+ }
70
+ end
71
+
72
+ def each
73
+ url = start_url
74
+
75
+ resumption_token = nil
76
+ last_resumption_token = nil
77
+ pages_fetched = 0
78
+
79
+ until url == nil
80
+ resumption_token = read_and_parse_response(url) do |record|
81
+ yield record
82
+ end
83
+ url = resumption_url(resumption_token)
84
+ (last_resumption_token = resumption_token) if resumption_token
85
+ pages_fetched += 1
86
+ end
87
+
88
+ logger.info("#{self.class.name}: fetched #{pages_fetched} pages; last resumptionToken found: #{last_resumption_token.inspect}")
89
+ end
90
+
91
+ def resumption_url(resumption_token)
92
+ return nil if resumption_token.nil? || resumption_token == ""
93
+
94
+ # resumption URL is just original verb with resumption token, that seems to be
95
+ # the oai-pmh spec.
96
+ parsed_uri = URI.parse(start_url)
97
+ parsed_uri.query = "verb=#{CGI.escape start_url_verb}&resumptionToken=#{CGI.escape resumption_token}"
98
+ parsed_uri.to_s
99
+ end
100
+
101
+ def timeout
102
+ settings["oai_pmh.timeout"]
103
+ end
104
+
105
+ def logger
106
+ @logger ||= (@settings[:logger] || Yell.new(STDERR, :level => "gt.fatal")) # null logger)
107
+ end
108
+
109
+ private
110
+
111
+ # re-use an http-client for subsequent requests, to get http.rb's persistent connection re-use
112
+ # Note this means this is NOT thread safe, which is fine for now, but we'd have to do something
113
+ # different if we tried to multi-thread reading multiple files or something.
114
+ #
115
+ # @returns [HTTP::Client] from http.rb gem
116
+ def http_client
117
+ @http_client ||= begin
118
+ # timeout setting on http.rb seems to be a mess.
119
+ # https://github.com/httprb/http/issues/488
120
+ client = HTTP.timeout(:global, write: timeout / 3, connect: timeout / 3, read: timeout / 3)
121
+
122
+ if settings["oai_pmh.try_gzip"]
123
+ client = client.use(:auto_inflate).headers("accept-encoding" => "gzip;q=1.0, identity;q=0.5")
124
+ end
125
+
126
+ if settings["oai_pmh.http_persistent"]
127
+ parsed_uri = URI.parse(start_url)
128
+ client = client.persistent("#{parsed_uri.scheme}://#{parsed_uri.host}")
129
+ end
130
+
131
+ client
132
+ end
133
+ end
134
+
135
+ def read_and_parse_response(url)
136
+ http_response = http_client.get(url)
137
+
138
+ #File.write("our_oai/#{Time.now.to_i}.xml", body)
139
+
140
+ # Not sure why JRuby Nokogiri requires us to call #to_s on it first;
141
+ # not sure if this has perf implications. In either case, not sure
142
+ # if we are reading a separate copy of response into memory, or if Noko
143
+ # consumes it streaming. Trying to explicitly stream it to nokogiri, using
144
+ # http.rb#readpartial, just gave us a big headache.
145
+ noko_source_arg = if Traject::Util.is_jruby?
146
+ http_response.body.to_s
147
+ else
148
+ http_response.body
149
+ end
150
+
151
+ reader = Traject::NokogiriReader.new(noko_source_arg, settings)
152
+
153
+ reader.each { |d| yield d }
154
+
155
+ return reader.clipboard[:resumption_token]
156
+ end
157
+
158
+ end
159
+ end