solr_cursorstream 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 85344a1d5cbdad956770cdb60c76c3304a12f11707ebffc59096abbe403b5d53
4
+ data.tar.gz: 13f93a423feab337b1bde721b0b13bc3335cf4eb976e3d6e04cf629cfa486b0d
5
+ SHA512:
6
+ metadata.gz: e93a0a7dca05d60f9a2f2f6731071a1d9896df6de2899928ebda53c95876cc7b08677c88be7ec9986e848b8c63b622369d6cdd4e57e9f638c40a778671efe500
7
+ data.tar.gz: 1c2b116c552f38d430fda98080f1af9ed728e88eabca60f926d6408c0c593c3258dbab60bebb4cfd3d2f6d5411db54b2d1c2a72419af263cff7a96158a859590
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1,13 @@
1
+ AllCops:
2
+ TargetRubyVersion: 2.6
3
+
4
+ Style/StringLiterals:
5
+ Enabled: true
6
+ EnforcedStyle: double_quotes
7
+
8
+ Style/StringLiteralsInInterpolation:
9
+ Enabled: true
10
+ EnforcedStyle: double_quotes
11
+
12
+ Layout/LineLength:
13
+ Max: 120
data/CHANGELOG.md ADDED
@@ -0,0 +1,5 @@
1
+ ## [Unreleased]
2
+
3
+ ## [0.1.0] - 2022-06-21
4
+ * Initial release
5
+ * See bottom of README.md for todo list
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ # Specify your gem's dependencies in solr_cursorstream.gemspec
6
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2022 Bill Dueber
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,112 @@
1
+ # Solr::CursorStream
2
+
3
+ "Stream" results from solr with
4
+ [cursor-based fetching](https://solr.apache.org/guide/8_6/pagination-of-results.html#fetching-a-large-number-of-sorted-results-cursors),
5
+ exposing the stream as a normal ruby enumerator.
6
+
7
+ Note that this is different from true streaming of results via, e.g.,
8
+ the [default `/export` handler](https://solr.apache.org/guide/8_6/exporting-result-sets.html).
9
+ Those queries can involve more complex processing, but are restricted in
10
+ that
11
+ * you can't use relevancy ranking
12
+ * all fields have to be `docValues`.
13
+
14
+ Cursor-based streaming allows, with some restrictions,
15
+ downloading large sets of data without the "deep paging" problems
16
+ associated with just using the `start` and `rows` parameters.
17
+
18
+ The only significant restriction is that _the sort specification MUST
19
+ include the `uniqueKey` field_. If you're just downloading a whole dataset and
20
+ don't care about order, the default query of `*:*` and the default sort of `id asc`
21
+ will be fine (assuming your uniqueKey is `id`). If you want to sort by
22
+ another field/value, you must use the uniqueKey in a secondary sort (e.g.,
23
+ `sort: "score desc, id asc"`) to guarantee a stable sort.
24
+
25
+ NOTE that if you don't need the `score` (relevancy) field,
26
+ _use the default query parameter of `*:*`_ so
27
+ solr doesn't have to work as hard. Just put your restrictions in the
28
+ `filters` array.
29
+
30
+ ## Usage
31
+
32
+ ```ruby
33
+ require 'solr/cursorstream'
34
+
35
+ core_url = "http://my.solr.com:8025/solr/mycore/"
36
+
37
+ # Get everything in the solr core, no restrictions
38
+ cs = Solr::CursorStream.new(url: core_url)
39
+ cs.each {|doc| ... }
40
+
41
+ # Filter for newer stuff
42
+ # Note that you need to lucene-escape any q/fq values on your own, since
43
+ # otherwise we'd need a full solr syntax parser to determine which
44
+ # bits to escape.
45
+ cs = Solr::CursorStream.new(url: core_url, filters: ['year:{2010 TO *}'])
46
+
47
+ # Find everything with the phrase "Civil War" in the title and
48
+ # pre-20th century, ordered by year
49
+ cs = Solr::CursorStream.new(url: core_url) do |s|
50
+ s.filters = ['year:[* TO 1900]', 'title:"Civil War"']
51
+ s.sort = 'year asc, id asc' # need to include the uniqueKey field (id)!
52
+ end
53
+
54
+ # #each yields a solr document hash until it runs out
55
+ cs.each {|doc| ... }
56
+
57
+ # The underlying Faraday http connection is available if you need
58
+ # to mess with it directly
59
+ cs.connection.set_basic_auth(user, password)
60
+
61
+ # There are a _lot_ of possible arguments to `new`. It may be easier
62
+ # to specify values in a block
63
+ cs = Solr::CursorStream.new(url: core_url) do |s|
64
+ s.batch_size = 100
65
+ s.fields = %w[id title author year]
66
+ s.filters = ["year:[* TO 1900]"]
67
+ s.query = "title:(Civil War)"
68
+ s.sort = 'score desc, id asc'
69
+ end
70
+
71
+ # Get the first 10_000 results from a query
72
+ cs.each_with_index do |doc, i|
73
+ break if i >= 10_000
74
+ do_something_with_the_solr_doc(doc)
75
+ end
76
+
77
+ ```
78
+
79
+ ## TODO
80
+
81
+ - [ ] Add a :limit option
82
+ - [ ] Add a `lucene_escape` utility function
83
+ - [ ] Change q/fq to take either a string (as current) or a {field => value} hash
84
+ - [ ] Actual error handling, or at least passing useful information along
85
+ - [ ] Figure out how to test without a live solr to bounce off of. Maybe use
86
+   vcr or similar?
87
+
88
+ ## Installation
89
+
90
+ Add this line to your application's Gemfile:
91
+
92
+ ```ruby
93
+ gem 'solr_cursorstream'
94
+ ```
95
+
96
+ And then execute:
97
+
98
+ $ bundle install
99
+
100
+ Or install it yourself as:
101
+
102
+ $ gem install solr_cursorstream
103
+
104
+
105
+
106
+ ## Contributing
107
+
108
+ Bug reports and pull requests are welcome on GitHub at https://github.com/mlibrary/solr_cursorstream.
109
+
110
+ ## License
111
+
112
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ require "standard/rake"
9
+
10
+ task default: [:spec, "standard:fix"]
data/bin/console ADDED
@@ -0,0 +1,15 @@
1
#!/usr/bin/env ruby
# frozen_string_literal: true

# Interactive console with the gem preloaded, for experimenting during development.

require "bundler/setup"
# The library's entry point lives at lib/solr/cursorstream.rb, so it must be
# required by its namespaced path; a bare "cursorstream" does not exist in this gem.
require "solr/cursorstream"

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

# (If you use this, don't forget to add pry to your Gemfile!)
# require "pry"
# Pry.start

require "irb"
IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,30 @@
1
# frozen_string_literal: true

require "delegate"

module Solr
  class CursorStream
    # Wrapper around a Faraday::Response that provides sugar methods
    # to get solr docs, numFound, and the cursor value.
    #
    # Every other method call is delegated to the parsed response body, so the
    # wrapper can also be used like the body itself (e.g. `resp["responseHeader"]`).
    #
    # The namespace is opened with nested `module`/`class` keywords so this file
    # can be loaded on its own, without requiring that `Solr::CursorStream` was
    # defined beforehand.
    class Response < SimpleDelegator
      # @param [Faraday::Response] faraday_response Any object responding to
      #   `#body`; the body is expected to be the already-parsed solr JSON.
      def initialize(faraday_response)
        @base_resp = faraday_response
        @resp = faraday_response.body
        # Delegate to the parsed body rather than to the raw HTTP response.
        super(@resp)
      end

      # @return [Array<Hash>] Array of solr documents returned, as simple hashes
      def docs
        @resp["response"]["docs"]
      end

      # @return [Integer] Number of documents found for the solr query
      def num_found
        @resp["response"]["numFound"]
      end

      # @return [String] value of the cursor (`nextCursorMark`) as returned from solr
      def cursor
        @resp["nextCursorMark"]
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Solr
4
+ class CursorStream
5
+ VERSION = "0.1.0"
6
+ end
7
+ end
@@ -0,0 +1,127 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "solr/cursorstream/version"
4
+ require "solr/cursorstream/response"
5
+ require "faraday"
6
+ require "faraday/retry"
7
+
8
module Solr
  # Fetch results from a solr filter query via solr's cursor streaming.
  # https://solr.apache.org/guide/8_6/pagination-of-results.html#fetching-a-large-number-of-sorted-results-cursors
  #
  # Note that accessors for things like query, filters, etc. are made available for ease of configuration _only_.
  # Changing anything in the middle of a job will screw up the cursors and leave things undetermined. Just
  # make another CursorStream object.
  class CursorStream
    include Enumerable

    class Error < StandardError; end

    # Cursor mark sent on the first request of a walk through the results.
    INITIAL_CURSOR = "*"

    attr_accessor :url, :query, :handler, :filters, :sort, :batch_size, :fields, :logger

    # @param [String] url URL to the solr _core_ (e.g., http://my.machine.com/solr/mycore)
    # @param [String] handler The specific handler to target.
    # @param [String] query The main solr query (`q`).
    # @param [Array<String>] filters Array of filter queries to apply.
    # @param [String] sort A valid solr sort string. MUST include the unique field (as per solr docs)
    # @param [Integer] batch_size How many results to fetch at a time (for efficiency)
    # @param [Array<String>] fields The solr fields to return.
    # @param [Logger, #info] logger A logger or logger-like object. It is stored and exposed through the
    #   accessor; nothing in this class writes to it yet.
    # @param [Symbol] adapter A valid Faraday adapter. If not using the default httpx, it is up to the
    #   programmer to do whatever `require` calls are necessary.
    # @yieldparam [CursorStream] self The new object, for block-style configuration.
    def initialize(url:, handler: "select", query: "*:*", filters: ["*:*"], sort: "id asc", batch_size: 100,
      fields: [], logger: nil, adapter: :httpx)
      @url = url.sub(%r{/\Z}, "")
      @query = query
      @handler = handler
      @filters = filters
      @sort = sort
      @batch_size = batch_size
      @fields = fields
      @logger = logger
      @adapter = adapter

      reset_cursor
      yield self if block_given?
    end

    # @return [String] solr url built from the configured url and the handler
    def solr_url
      # The url may have been replaced through the accessor, bypassing the
      # constructor's clean-up, so drop a trailing slash here as well.
      "#{url.chomp("/")}/#{handler}"
    end

    # Iterate through the documents in the stream. Behind the scenes, these will be fetched in batches
    # of `batch_size` for efficiency.
    #
    # Every call starts again from the first page, so the stream can be walked more than once
    # (e.g. `count` followed by `each`).
    # @yieldparam [Hash] doc A single solr document from the stream
    # @return [Enumerator, nil] An enumerator when called without a block
    # @raise [Error] if required configuration is missing
    def each
      return enum_for(:each) unless block_given?

      verify_we_have_everything!
      reset_cursor
      while solr_has_more?
        get_page.docs.each { |doc| yield doc }
      end
    end

    # Build up a Faraday connection
    # @param [Symbol] adapter Which faraday adapter to use. If not :httpx, you must have loaded the
    #   necessary adapter already.
    # @return [Faraday::Connection] A faraday connection object.
    def self.connection(adapter: :httpx)
      require "httpx/adapters/faraday" if adapter == :httpx
      Faraday.new(request: {params_encoder: Faraday::FlatParamsEncoder}) do |builder|
        builder.use Faraday::Response::RaiseError
        builder.request :url_encoded
        builder.request :retry
        builder.response :json
        # Use the argument: inside a class method `@adapter` would be an unset class-level ivar.
        builder.adapter adapter
      end
    end

    # Memoized connection for this stream; the adapter only matters on the first call.
    # @see CursorStream.connection
    # @return [Faraday::Connection]
    def connection(adapter: @adapter)
      @connection ||= self.class.connection(adapter: adapter)
    end

    # @private
    # Get a single "page" (`batch_size` documents) from solr and advance the cursor. Feeds into #each
    # @return [Solr::CursorStream::Response]
    # @raise [Error] if the response carries no `nextCursorMark`
    def get_page
      params = {cursorMark: @current_cursor}.merge(default_params)
      page = Response.new(connection.get(solr_url, params))
      next_cursor = page.cursor
      # Without a next cursor the walk can neither advance nor detect its end.
      raise Error, "No nextCursorMark in response from #{solr_url}" if next_cursor.nil?

      @last_cursor = @current_cursor
      @current_cursor = next_cursor
      page
    end

    # @private
    # @return [Hash] Default solr params derived from instance variables, without blank
    #   (nil, empty-string, or empty-array) values
    def default_params
      candidates = {
        q: query,
        wt: :json,
        rows: batch_size,
        sort: sort,
        fq: filters,
        fl: Array(fields).join(",")
      }
      # `reject` returns a new hash, so its result must be the return value.
      candidates.reject { |_name, value| [nil, "", []].include?(value) }
    end

    # @private
    # Make sure we have everything we need for a successful stream
    # @raise [Error] naming every required setting that is nil
    def verify_we_have_everything!
      required = {url: url, handler: handler, filters: filters, batch_size: batch_size}
      missing = required.select { |_name, value| value.nil? }.keys
      raise Error, "Solr::CursorStream missing value for #{missing.join(", ")}" unless missing.empty?
    end

    # @private
    # Determine if solr has another page of results: the walk is over once a
    # request hands back the same cursor that was sent.
    # @return [Boolean]
    def solr_has_more?
      @last_cursor != @current_cursor
    end

    # @private
    # Not currently passed to the retry middleware configured in `.connection`.
    # @return [Proc] Lambda meant to run every time the connection needs to retry due to http error
    def http_request_retry_block
      ->(env:, options:, retries_remaining:, exception:, will_retry_in:) do
        # TODO: log that a retry happened
      end
    end

    private

    # Put the cursor state back to "nothing fetched yet".
    def reset_cursor
      @last_cursor = nil
      @current_cursor = INITIAL_CURSOR
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/solr/cursorstream/version"
4
+
5
# Gem packaging metadata for solr_cursorstream.
Gem::Specification.new do |spec|
  spec.name = "solr_cursorstream"
  spec.version = Solr::CursorStream::VERSION
  spec.authors = ["Bill Dueber"]
  spec.email = ["bill@dueber.com"]

  spec.summary = "Get an iterator on a solr filter using stream/cursor"
  spec.homepage = "https://github.com/mlibrary/solr_cursorstream"
  spec.license = "MIT"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = spec.homepage
  # GitHub serves repository files under /blob/<ref>/…; HEAD follows the default branch.
  spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/HEAD/CHANGELOG.md"

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject do |f|
      (f == __FILE__) || f.match(%r{\A(?:(?:test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
    end
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_dependency "faraday"
  spec.add_dependency "faraday-retry"
  spec.add_dependency "httpx"
  # NOTE(review): nothing under lib/ appears to require milemarker — confirm it is still needed.
  spec.add_dependency "milemarker"

  spec.add_development_dependency "pry"
  spec.add_development_dependency "rake", "~> 13.0"
  spec.add_development_dependency "rspec", "~> 3.0"
  spec.add_development_dependency "standard"
end
metadata ADDED
@@ -0,0 +1,171 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: solr_cursorstream
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Bill Dueber
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2022-06-21 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: faraday
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: faraday-retry
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: httpx
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: milemarker
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: pry
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '13.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '13.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rspec
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '3.0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '3.0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: standard
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ description:
126
+ email:
127
+ - bill@dueber.com
128
+ executables: []
129
+ extensions: []
130
+ extra_rdoc_files: []
131
+ files:
132
+ - ".rspec"
133
+ - ".rubocop.yml"
134
+ - CHANGELOG.md
135
+ - Gemfile
136
+ - LICENSE.txt
137
+ - README.md
138
+ - Rakefile
139
+ - bin/console
140
+ - bin/setup
141
+ - lib/solr/cursorstream.rb
142
+ - lib/solr/cursorstream/response.rb
143
+ - lib/solr/cursorstream/version.rb
144
+ - solr_cursorstream.gemspec
145
+ homepage: https://github.com/mlibrary/solr_cursorstream
146
+ licenses:
147
+ - MIT
148
+ metadata:
149
+ homepage_uri: https://github.com/mlibrary/solr_cursorstream
150
+ source_code_uri: https://github.com/mlibrary/solr_cursorstream
151
+ changelog_uri: https://github.com/mlibrary/solr_cursorstream/CHANGELOG.md
152
+ post_install_message:
153
+ rdoc_options: []
154
+ require_paths:
155
+ - lib
156
+ required_ruby_version: !ruby/object:Gem::Requirement
157
+ requirements:
158
+ - - ">="
159
+ - !ruby/object:Gem::Version
160
+ version: '0'
161
+ required_rubygems_version: !ruby/object:Gem::Requirement
162
+ requirements:
163
+ - - ">="
164
+ - !ruby/object:Gem::Version
165
+ version: '0'
166
+ requirements: []
167
+ rubygems_version: 3.1.2
168
+ signing_key:
169
+ specification_version: 4
170
+ summary: Get an iterator on a solr filter using stream/cursor
171
+ test_files: []