solr_cursorstream 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 85344a1d5cbdad956770cdb60c76c3304a12f11707ebffc59096abbe403b5d53
4
+ data.tar.gz: 13f93a423feab337b1bde721b0b13bc3335cf4eb976e3d6e04cf629cfa486b0d
5
+ SHA512:
6
+ metadata.gz: e93a0a7dca05d60f9a2f2f6731071a1d9896df6de2899928ebda53c95876cc7b08677c88be7ec9986e848b8c63b622369d6cdd4e57e9f638c40a778671efe500
7
+ data.tar.gz: 1c2b116c552f38d430fda98080f1af9ed728e88eabca60f926d6408c0c593c3258dbab60bebb4cfd3d2f6d5411db54b2d1c2a72419af263cff7a96158a859590
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1,13 @@
1
+ AllCops:
2
+ TargetRubyVersion: 2.6
3
+
4
+ Style/StringLiterals:
5
+ Enabled: true
6
+ EnforcedStyle: double_quotes
7
+
8
+ Style/StringLiteralsInInterpolation:
9
+ Enabled: true
10
+ EnforcedStyle: double_quotes
11
+
12
+ Layout/LineLength:
13
+ Max: 120
data/CHANGELOG.md ADDED
@@ -0,0 +1,5 @@
1
+ ## [Unreleased]
2
+
3
+ ## [0.1.0] - 2022-06-21
4
+ * Initial release
5
+ * See bottom of README.md for todo list
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ # Specify your gem's dependencies in solr_cursorstream.gemspec
6
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2022 Bill Dueber
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,112 @@
1
+ # Solr::CursorStream
2
+
3
+ "Stream" results from solr with
4
+ [cursor-based fetching](https://solr.apache.org/guide/8_6/pagination-of-results.html#fetching-a-large-number-of-sorted-results-cursors),
5
+ exposing the stream as a normal ruby enumerator.
6
+
7
+ Note that this is different from true streaming of results via, e.g.,
8
+ the [default `/export` handler](https://solr.apache.org/guide/8_6/exporting-result-sets.html).
9
+ Those queries can involve more complex processing, but are restricted in
10
+ that you
11
+ * can't use relevancy ranking
12
+ * all fields have to be `docValues`.
13
+
14
+ Cursor-based streaming allows, with some restrictions,
15
+ downloading large sets of data without the "deep paging" problems
16
+ associated with just using the `start` and `rows` parameters.
17
+
18
+ The only significant restriction is that _the sort specification MUST
19
+ include the `uniqueKey` field_. If you're just downloading a whole dataset and
20
+ don't care about order, the default query of `*:*` and the default sort of `id asc`
21
+ will be fine (assuming your uniqueKey is `id`). If you want to sort by
22
+ another field/value, you must use the uniqueKey in a secondary sort (e.g.,
23
+ `sort: "score desc, id asc"`) to guarantee a stable sort.
24
+
25
+ NOTE that if you don't need the `score` (relevancy) field,
26
+ _use the default query parameter of `*:*`_ so
27
+ solr doesn't have to work as hard. Just put your restrictions in the
28
+ `filters` array.
29
+
30
+ ## Usage
31
+
32
+ ```ruby
33
+ require 'solr/cursorstream'
34
+
35
+ core_url = "http://my.solr.com:8025/solr/mycore/"
36
+
37
+ # Get everything in the solr core, no restrictions
38
+ cs = Solr::CursorStream.new(url: core_url)
39
+ cs.each {|doc| ... }
40
+
41
+ # Filter for newer stuff
42
+ # Note that you need to lucene-escape any q/fq values on your own, since
43
+ # otherwise we'd need a full solr syntax parser to determine which
44
+ # bits to escape.
45
+ cs = Solr::CursorStream.new(url: core_url, filters: ['year:{2010 TO *}'])
46
+
47
+ # Find everything with the phrase "Civil War" in the title and
48
+ # pre-20th century, ordered by year
49
+ cs = Solr::CursorStream.new(url: core_url) do |s|
50
+ s.filters = ['year:[* TO 1900]', 'title:"Civil War"']
51
+ s.sort = 'year asc, id asc' # need to include the uniqueKey field (id)!
52
+ end
53
+
54
+ # #each yields a solr document hash until it runs out
55
+ cs.each {|doc| ... }
56
+
57
+ # The underlying Faraday http connection is available if you need
58
+ # to mess with it directly
59
+ cs.connection.set_basic_auth(user, password)
60
+
61
+ # There are a _lot_ of possible arguments to `new`. It may be easier
62
+ # to specify values in a block
63
+ cs = Solr::CursorStream.new(url: core_url) do |s|
64
+ s.batch_size = 100
65
+ s.fields = %w[id title author year]
66
+ s.filters = ["year:[* TO 1900]"]
67
+ s.query = "title:(Civil War)"
68
+ s.sort = 'score desc, id asc'
69
+ end
70
+
71
+ # Get the first 10_000 results from a query
72
+ cs.each_with_index do |doc, i|
73
+ break if i >= 10_000
74
+ do_something_with_the_solr_doc(doc)
75
+ end
76
+
77
+ ```
78
+
79
+ ## TODO
80
+
81
+ - [ ] Add a :limit option
82
+ - [ ] Add a `lucene_escape` utility function
83
+ - [ ] Change q/fq to take either a string (as current) or a {field => value} hash
84
+ - [ ] Actual error handling, or at least passing useful information along
85
+ - [ ] Figure out how to test without a live solr to bounce off of. Maybe use
86
+       vcr or similar?
87
+
88
+ ## Installation
89
+
90
+ Add this line to your application's Gemfile:
91
+
92
+ ```ruby
93
+ gem 'solr_cursorstream'
94
+ ```
95
+
96
+ And then execute:
97
+
98
+ $ bundle install
99
+
100
+ Or install it yourself as:
101
+
102
+ $ gem install solr_cursorstream
103
+
104
+
105
+
106
+ ## Contributing
107
+
108
+ Bug reports and pull requests are welcome on GitHub at https://github.com/mlibrary/solr_cursorstream.
109
+
110
+ ## License
111
+
112
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ require "standard/rake"
9
+
10
+ task default: [:spec, "standard:fix"]
data/bin/console ADDED
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require "bundler/setup"
5
+ require "cursorstream"
6
+
7
+ # You can add fixtures and/or initialization code here to make experimenting
8
+ # with your gem easier. You can also use a different console, if you like.
9
+
10
+ # (If you use this, don't forget to add pry to your Gemfile!)
11
+ # require "pry"
12
+ # Pry.start
13
+
14
+ require "irb"
15
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,30 @@
1
+ ## frozen_string_literal: true
2
+
3
+ require "delegate"
4
+
5
+ # Wrapper around a Faraday::Response that delegates to the response body and provides
6
+ # sugar methods to get solr docs, numFound, and the cursor value
7
+ class Solr::CursorStream::Response < SimpleDelegator
8
+   # @param [Faraday::Response] faraday_response Response whose #body holds the solr result
9
+   def initialize(faraday_response)
10
+     super
11
+     @base_resp = faraday_response
12
+     @resp = faraday_response.body
13
+     __setobj__(@resp)
14
+   end
15
+
16
+   # @return [Array<Hash>] Array of solr documents returned (body["response"]["docs"]), as simple hashes
17
+   def docs
18
+     @resp["response"]["docs"]
19
+   end
20
+
21
+   # @return [Integer] Number of documents found for the solr query (body["response"]["numFound"])
22
+   def num_found
23
+     @resp["response"]["numFound"]
24
+   end
25
+
26
+   # @return [String] value of the cursor as returned from solr (body["nextCursorMark"])
27
+   def cursor
28
+     @resp["nextCursorMark"]
29
+   end
30
+ end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Solr
4
+ class CursorStream
5
+ VERSION = "0.1.0"
6
+ end
7
+ end
@@ -0,0 +1,127 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "solr/cursorstream/version"
4
+ require "solr/cursorstream/response"
5
+ require "faraday"
6
+ require "faraday/retry"
7
+
8
+ module Solr
9
+ # Fetch results from a solr filter query via solr's cursor streaming.
10
+ # https://solr.apache.org/guide/8_6/pagination-of-results.html#fetching-a-large-number-of-sorted-results-cursors
11
+ #
12
+ # Note that accessors for things like query, filters, etc. are made available for ease of configuration _only_.
13
+ # Changing anything in the middle of a job will screw up the cursors and leave things undetermined. Just
14
+ # make another CursorStream object.
15
+ class CursorStream
16
+ include Enumerable
17
+
18
+ class Error < StandardError; end
19
+
20
+ attr_accessor :url, :query, :handler, :filters, :sort, :batch_size, :fields, :logger
21
+
22
+     # @param [String] url URL to the solr _core_ (e.g., http://my.machine.com/solr/mycore). A trailing slash is removed.
23
+     # @param [String] handler The specific handler to target.
24
+     # @param [Array<String>] filters Array of filter queries to apply.
25
+     # @param [String] sort A valid solr sort string. MUST include the unique field (as per solr docs)
26
+     # @param [Integer] batch_size How many results to fetch at a time (for efficiency)
27
+     # @param [Array<String>] fields The solr fields to return.
28
+     # @param [Logger, #info] logger A logger or logger-like object. When set to `nil` will not do any logging.
29
+     # @param [Symbol] adapter A valid Faraday adapter. If not using the default httpx, it is up to the
30
+     #   programmer to do whatever `require` calls are necessary.
31
+     def initialize(url:, handler: "select", query: "*:*", filters: ["*:*"], sort: "id asc", batch_size: 100, fields: [], logger: nil, adapter: :httpx)
32
+       @url = url.gsub(/\/\Z/, "")
33
+       @query = query
34
+       @handler = handler
35
+       @filters = filters
36
+       @sort = sort
37
+       @batch_size = batch_size
38
+       @fields = fields
39
+       @logger = logger
40
+       @adapter = adapter
41
+
42
+       @current_cursor = "*"
43
+       yield self if block_given?
44
+     end
45
+
46
+     # @return [String] solr url built from the passed url and the handler, joined with "/"
47
+     def solr_url
48
+       url + "/" + handler
49
+     end
50
+
51
+     # Iterate through the documents in the stream. Behind the scenes, these will be fetched in batches
52
+     # of `batch_size` for efficiency. Without a block, returns an Enumerator over the same documents.
53
+     # @yieldparam [Hash] doc A single solr document from the stream
54
+     def each
55
+       return enum_for(:each) unless block_given?
56
+       verify_we_have_everything!
57
+       while solr_has_more?
58
+         cursor_response = get_page
59
+         cursor_response.docs.each { |d| yield d }
60
+       end
61
+     end
62
+
63
+ # Build up a Faraday connection
64
+ # @param [Symbol] adapter Which faraday adapter to use. If not :httpx, you must have loaded the
65
+ # necessary adapter already.
66
+ # @return [Faraday::Connection] A faraday connection object.
67
+ def self.connection(adapter: :httpx)
68
+ require "httpx/adapters/faraday" if adapter == :httpx
69
+ Faraday.new(request: {params_encoder: Faraday::FlatParamsEncoder}) do |builder|
70
+ builder.use Faraday::Response::RaiseError
71
+ builder.request :url_encoded
72
+ builder.request :retry
73
+ builder.response :json
74
+ builder.adapter @adapter
75
+ end
76
+ end
77
+
78
+ # @see CursorStream.connection
79
+ def connection(adapter: @adapter)
80
+ return @connection if @connection
81
+ @connection = self.class.connection(adapter: @adapter)
82
+ end
83
+
84
+     # @private
85
+     # Get a single "page" (`batch_size` documents) from solr and advance the stored cursor. Feeds into #each
86
+     # @return [Solr::CursorStream::Response]
87
+     def get_page
88
+       params = {cursorMark: @current_cursor}.merge default_params
89
+       r = connection.get(solr_url, params)
90
+       resp = Response.new(r)
91
+       @last_cursor = @current_cursor
92
+       @current_cursor = resp.cursor
93
+       resp
94
+     end
95
+
96
+ # @private
97
+ # @return [Hash] Default solr params derived from instance variables
98
+ def default_params
99
+ field_list = Array(fields).join(",")
100
+ p = {q: @query, wt: :json, rows: batch_size, sort: @sort, fq: filters, fl: field_list}
101
+ p.reject { |_k, v| [nil, "", []].include?(v) }
102
+ p
103
+ end
104
+
105
+ # @private
106
+ # Make sure we have everything we need for a successful stream
107
+ def verify_we_have_everything!
108
+ missing = {handler: @handler, filters: @filters, batch_size: @batch_size}.select { |_k, v| v.nil? }.keys
109
+ raise Error.new("Solr::CursorStreamer missing value for #{missing.join(", ")}") unless missing.empty?
110
+ end
111
+
112
+     # @private
113
+     # Determine if solr has another page of results: true until the cursor from the last fetch equals the one sent
114
+     # @return [Boolean]
115
+     def solr_has_more?
116
+       @last_cursor != @current_cursor
117
+     end
118
+
119
+     # @private
120
+     # @return [Proc] Lambda meant to run on each http retry; currently a no-op, and nothing visible here uses it
121
+     def http_request_retry_block
122
+       ->(env:, options:, retries_remaining:, exception:, will_retry_in:) do
123
+         # TODO: log that a retry happened
124
+       end
125
+     end
126
+ end
127
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/solr/cursorstream/version"
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = "solr_cursorstream"
7
+ spec.version = Solr::CursorStream::VERSION
8
+ spec.authors = ["Bill Dueber"]
9
+ spec.email = ["bill@dueber.com"]
10
+
11
+ spec.summary = "Get an iterator on a solr filter using stream/cursor"
12
+ spec.homepage = "https://github.com/mlibrary/solr_cursorstream"
13
+ spec.license = "MIT"
14
+
15
+ spec.metadata["homepage_uri"] = spec.homepage
16
+ spec.metadata["source_code_uri"] = spec.homepage
17
+ spec.metadata["changelog_uri"] = spec.homepage + "/CHANGELOG.md"
18
+
19
+ # Specify which files should be added to the gem when it is released.
20
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
21
+ spec.files = Dir.chdir(File.expand_path(__dir__)) do
22
+ `git ls-files -z`.split("\x0").reject do |f|
23
+ (f == __FILE__) || f.match(%r{\A(?:(?:test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
24
+ end
25
+ end
26
+ spec.bindir = "exe"
27
+ spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
28
+ spec.require_paths = ["lib"]
29
+
30
+ spec.add_dependency "faraday"
31
+ spec.add_dependency "faraday-retry"
32
+ spec.add_dependency "httpx"
33
+ spec.add_dependency "milemarker"
34
+
35
+ spec.add_development_dependency "pry"
36
+ spec.add_development_dependency "rake", "~> 13.0"
37
+ spec.add_development_dependency "rspec", "~> 3.0"
38
+ spec.add_development_dependency "standard"
39
+ end
metadata ADDED
@@ -0,0 +1,171 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: solr_cursorstream
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Bill Dueber
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2022-06-21 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: faraday
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: faraday-retry
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: httpx
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: milemarker
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: pry
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '13.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '13.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rspec
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '3.0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '3.0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: standard
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ description:
126
+ email:
127
+ - bill@dueber.com
128
+ executables: []
129
+ extensions: []
130
+ extra_rdoc_files: []
131
+ files:
132
+ - ".rspec"
133
+ - ".rubocop.yml"
134
+ - CHANGELOG.md
135
+ - Gemfile
136
+ - LICENSE.txt
137
+ - README.md
138
+ - Rakefile
139
+ - bin/console
140
+ - bin/setup
141
+ - lib/solr/cursorstream.rb
142
+ - lib/solr/cursorstream/response.rb
143
+ - lib/solr/cursorstream/version.rb
144
+ - solr_cursorstream.gemspec
145
+ homepage: https://github.com/mlibrary/solr_cursorstream
146
+ licenses:
147
+ - MIT
148
+ metadata:
149
+ homepage_uri: https://github.com/mlibrary/solr_cursorstream
150
+ source_code_uri: https://github.com/mlibrary/solr_cursorstream
151
+ changelog_uri: https://github.com/mlibrary/solr_cursorstream/CHANGELOG.md
152
+ post_install_message:
153
+ rdoc_options: []
154
+ require_paths:
155
+ - lib
156
+ required_ruby_version: !ruby/object:Gem::Requirement
157
+ requirements:
158
+ - - ">="
159
+ - !ruby/object:Gem::Version
160
+ version: '0'
161
+ required_rubygems_version: !ruby/object:Gem::Requirement
162
+ requirements:
163
+ - - ">="
164
+ - !ruby/object:Gem::Version
165
+ version: '0'
166
+ requirements: []
167
+ rubygems_version: 3.1.2
168
+ signing_key:
169
+ specification_version: 4
170
+ summary: Get an iterator on a solr filter using stream/cursor
171
+ test_files: []