s3arch 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1d5bd8c8d47da22b8feb308ff8fbc96205d554ae915ef8e4e2831d5ba534f786
4
- data.tar.gz: 4741157e9c522074a08c3478dc4cfddd16445590cfdb107100121083ff34a29a
3
+ metadata.gz: cd57db198f9598324792e16e5b84e9c22914ebfdfd4fddeb09fc99641458322c
4
+ data.tar.gz: e107d35cd4b7084d3cf90e3d014bb4fe3c6089475b26419524d9132867565592
5
5
  SHA512:
6
- metadata.gz: 54b5a57df4c767d23eac10d7984ea5ec796f60cfe9047b24619cb3d67600af2b67e71e2f0639704a55939a057f7351f7778a0587b9db6b80fa768eb2398c9e18
7
- data.tar.gz: c4a2ff82ac4985b24376ccacab235e9a56132e1ee10b77202eff9defc62d0b1765ab9da31d89c695b42c3560984d1098df476a6b73b6a4561ad6c1cbefaa4749
6
+ metadata.gz: 65aba03e3064bf597dc8bb4e53d7552c07f0b89ebb7ca24b9e6c74a7f5c22f9c95746cec237d1e100fce63cb8f285f782487300abd7f4d1b6e1c76553fdad5ff
7
+ data.tar.gz: 28f5d435ce384c0c1c8e05d17d478ad9fdb9e0a949506350384886fdfcedbfaa2cbd9308dfd08d2faf91dfe4bf02ac07ac6d6749470143914a977f29ab0aef04
checksums.yaml.gz.sig ADDED
Binary file
data/CHANGELOG.md CHANGED
@@ -1,5 +1,21 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.0.2] - 2025-06-08
4
+
5
+ ### Added
6
+
7
+ - `S3arch::Tokenizer` — pure tokenization class for pre-computing search tokens
8
+ - Token-based indexing — indexer reads pre-computed tokens, never raw content
9
+ - Incremental updates via `Indexer#apply_changes` (DynamoDB Stream INSERT/MODIFY/REMOVE)
10
+ - `Indexer#process_event` — processes SQS events containing DynamoDB stream records
11
+ - `Configuration#token_field` — configurable DynamoDB attribute for stored tokens
12
+ - `Configuration#searchable_fields` — configurable list of fields to tokenize
13
+
14
+ ### Changed
15
+
16
+ - `S3arch::Indexer` rewritten to support both full rebuild and incremental updates
17
+ - FTS5 contentless DELETE support using stored token values from stream OLD_IMAGE
18
+
3
19
  ## [0.0.1] - 2025-06-08
4
20
 
5
21
  ### Added
@@ -0,0 +1,26 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIEdDCCAtygAwIBAgIBATANBgkqhkiG9w0BAQsFADBAMQ4wDAYDVQQDDAVhZ2Vu
3
+ dDEZMBcGCgmSJomT8ixkARkWCXN0b3d6aWxsYTETMBEGCgmSJomT8ixkARkWA2Nv
4
+ bTAeFw0yNjA2MDgxOTExNTlaFw0yNzA2MDgxOTExNTlaMEAxDjAMBgNVBAMMBWFn
5
+ ZW50MRkwFwYKCZImiZPyLGQBGRYJc3Rvd3ppbGxhMRMwEQYKCZImiZPyLGQBGRYD
6
+ Y29tMIIBojANBgkqhkiG9w0BAQEFAAOCAY8AMIIBigKCAYEAupBquKI/4WvXOgND
7
+ pXyqH2GllZs1wG4TWWdn/DoMg45UoCwD+AWEuGrIdInBCpPN8vEJNJWPoM/RrU+b
8
+ xRBZT4uUk00bnZRW2SYh5GJSqBoBR+rWc2DGkXyGfdRU2sQvkB0+is6ChgQ61WMM
9
+ 33LE9+loBlVsZ6EVtrc18Uh2OW0mJpe0hN2nmBrxZqqOZigxC4DKRMFHvpRkxSb6
10
+ mD4kit1AcwX9NEWJsXxrPaetL/SB/VbXaEZX93XAvp6USaXvCWt4slkDS2mIvqtn
11
+ 9DtGC43LFC7SDGbnsG9PVenQgVCi8UWFPUAab0PqZSlmi3Qlbhw8qTGPp5Cbv4vz
12
+ qjC2UGPOQigA/7lbbGRhCohMrjOVHMAQwkcgiIqtolUoYlnvPMIy+m3pdvgDv/PH
13
+ bsZGvXQ7i0458xsmp1vaKthZocVAR+GboHbuIiYPUnO45ccXUQ00x6365tTe7mZi
14
+ NvmUYdAGbQmVvFqyxF7IYA6sF74L2Lstu0knSfss557bAe1HAgMBAAGjeTB3MAkG
15
+ A1UdEwQCMAAwCwYDVR0PBAQDAgSwMB0GA1UdDgQWBBSnxTL/lNBCeLqpeVIX6AUY
16
+ kel4zjAeBgNVHREEFzAVgRNhZ2VudEBzdG93emlsbGEuY29tMB4GA1UdEgQXMBWB
17
+ E2FnZW50QHN0b3d6aWxsYS5jb20wDQYJKoZIhvcNAQELBQADggGBACm9Fjit/UCv
18
+ FxlKqeiCTIG94cIx+QrWAOJSx9knKydwUec1u04D/DbfZjTn3C2Bj227QgxeUn+6
19
+ if3e2v7zAk1896hLmGYzML0+nxQPb0vmtdLR7HETUlSKTVabcv1fbwLyjsuGrBvk
20
+ y51vOEzUEZ508a9yepLYqrQu1kOju4d57c9oA5l3H0mMKWz7av9tFj0B+STvuaWk
21
+ HRYDWc5HgOEVTyV+w0uFt2Kw4OCb8C42uSvC5RfYYtw78MSP+5Ru+LXJ7XOtmuN0
22
+ E6GVmofQ17ig9O3rgfFbMendSInrRmvPIGswvM1yivq9NOllFbdck2OJKPx6FCJF
23
+ 7SJIkXQfc9P4B5iASIV1d1FsE0YX+g3jHXPJK/4mGL5bAyBKzpMfQB/mg6vQBzkh
24
+ aOKPwcreFj7TznBl89R5tNS9wZQfPVR98zgPyocddWhK18eQNMSBUnv4eeJ8PPbk
25
+ DovL+G8ajHDZ9fjH/+GVYHEMuiVdLarXrKJpHC1VfGTTUAp4NSEpUQ==
26
+ -----END CERTIFICATE-----
@@ -2,16 +2,50 @@
2
2
 
3
3
  module S3arch
4
4
  class Configuration
5
- attr_accessor :source_table, :source_index, :owner_key, :index_bucket, :version_table,
6
- :searchable_fields, :metadata_fields, :record_filter, :owner_extractor,
7
- :logger, :version_ttl, :max_results, :max_cached_dbs, :ephemeral_storage_mb
5
+ # DynamoDB table that contains the source records to index
6
+ attr_accessor :source_table
7
+
8
+ # DynamoDB index to query records by owner (e.g., 'UserIndex')
9
+ attr_accessor :source_index
10
+
11
+ # Partition key field on the source table for owner lookup
12
+ attr_accessor :owner_key
13
+
14
+ # S3 bucket for storing SQLite index files
15
+ attr_accessor :index_bucket
16
+
17
+ # DynamoDB table for version tracking
18
+ attr_accessor :version_table
19
+
20
+ # FTS5 searchable fields — array of field names from the source record
21
+ attr_accessor :searchable_fields
22
+
23
+ # DynamoDB attribute name where pre-computed tokens are stored (Map type)
24
+ # e.g., { "searchTokens": { "name": "blue jacket", "description": "warm winter coat" } }
25
+ attr_accessor :token_field
26
+
27
+ # Metadata fields stored alongside FTS5 for filtering (not searched)
28
+ attr_accessor :metadata_fields
29
+
30
+ # Filter proc — receives a record hash, returns true to include in index
31
+ attr_accessor :record_filter
32
+
33
+ # Owner extractor — proc that extracts owner_id from a DynamoDB stream record
34
+ attr_accessor :owner_extractor
35
+
36
+ # Logger (nil by default, which disables logging)
37
+ attr_accessor :logger
38
+
39
+ # Searcher settings
40
+ attr_accessor :version_ttl, :max_results, :max_cached_dbs, :ephemeral_storage_mb
8
41
 
9
42
  def initialize
10
43
  @owner_key = 'user_id'
11
44
  @searchable_fields = %w[name description]
45
+ @token_field = 'searchTokens'
12
46
  @metadata_fields = %w[status created_at]
13
47
  @record_filter = ->(_record) { true }
14
- @owner_extractor = ->(stream_record) {
48
+ @owner_extractor = lambda { |stream_record|
15
49
  image = stream_record.dig('dynamodb', 'NewImage') || stream_record.dig('dynamodb', 'OldImage') || {}
16
50
  image.dig(owner_key, 'S')
17
51
  }
@@ -22,11 +56,12 @@ module S3arch
22
56
  @logger = nil
23
57
  end
24
58
 
59
+ # Convenience: env-based configuration (reads from Lambda environment variables)
25
60
  def from_env!
26
- @source_table = ENV['S3ARCH_SOURCE_TABLE']
61
+ @source_table = ENV['S3ARCH_SOURCE_TABLE'] || ENV.fetch('INVENTORY_TABLE', nil)
27
62
  @source_index = ENV['S3ARCH_SOURCE_INDEX'] || 'UserIndex'
28
- @index_bucket = ENV['S3ARCH_INDEX_BUCKET']
29
- @version_table = ENV['S3ARCH_VERSION_TABLE']
63
+ @index_bucket = ENV['S3ARCH_INDEX_BUCKET'] || ENV.fetch('SEARCH_INDEX_BUCKET', nil)
64
+ @version_table = ENV['S3ARCH_VERSION_TABLE'] || ENV.fetch('SEARCH_INDEX_TABLE', nil)
30
65
  self
31
66
  end
32
67
 
@@ -6,6 +6,9 @@ require 'sqlite3'
6
6
  require 'json'
7
7
 
8
8
  module S3arch
9
+ # Builds SQLite FTS5 databases per owner from pre-computed tokens stored in DynamoDB.
10
+ # The indexer never sees raw content — only tokens. Supports incremental updates via
11
+ # DynamoDB Stream events (INSERT/MODIFY/REMOVE).
9
12
  class Indexer
10
13
  def initialize(config: S3arch.configuration)
11
14
  config.validate!
@@ -14,6 +17,8 @@ module S3arch
14
17
  @s3 = Aws::S3::Client.new
15
18
  end
16
19
 
20
+ # Full rebuild — pulls all tokens from DynamoDB for an owner.
21
+ # Used for initial backfill or when incremental isn't possible.
17
22
  def rebuild(owner_id)
18
23
  records = fetch_records(owner_id)
19
24
  db_path = "/tmp/s3arch_#{owner_id}.sqlite3"
@@ -27,38 +32,255 @@ module S3arch
27
32
  File.delete(db_path) if db_path && File.exist?(db_path)
28
33
  end
29
34
 
35
+ # Incremental update — applies INSERT/DELETE/UPDATE to an existing index.
36
+ # Downloads current DB from S3, applies changes, re-uploads.
37
+ def apply_changes(owner_id, changes)
38
+ db_path = "/tmp/s3arch_#{owner_id}.sqlite3"
39
+ download_existing(owner_id, db_path)
40
+
41
+ unless File.exist?(db_path)
42
+ log(:info, 'No existing index, doing full rebuild', owner_id: owner_id)
43
+ return rebuild(owner_id)
44
+ end
45
+
46
+ db = SQLite3::Database.new(db_path)
47
+ db.results_as_hash = true
48
+
49
+ db.transaction do
50
+ changes.each { |change| apply_change(db, change) }
51
+ end
52
+
53
+ record_count = db.get_first_value('SELECT COUNT(*) FROM records_meta')
54
+ db.close
55
+
56
+ upload(owner_id, db_path)
57
+ increment_version(owner_id, record_count)
58
+
59
+ log(:info, 'Index updated incrementally', owner_id: owner_id, changes: changes.size, record_count: record_count)
60
+ ensure
61
+ File.delete(db_path) if db_path && File.exist?(db_path)
62
+ end
63
+
64
+ # Process SQS event containing DynamoDB stream records.
65
+ # Groups by owner and applies incremental changes.
30
66
  def process_event(event)
31
- records = event['Records'] || []
32
- owner_ids = records.filter_map { |r|
33
- body = JSON.parse(r['body'])
34
- @config.owner_extractor.call(body)
35
- }.uniq
67
+ sqs_records = event['Records'] || []
68
+ grouped = group_changes(sqs_records)
36
69
 
37
- log(:info, 'Rebuilding indexes', owner_ids: owner_ids, record_count: records.size)
38
- owner_ids.each { |id| rebuild(id) }
39
- { statusCode: 200, body: JSON.generate(rebuilt: owner_ids.size) }
70
+ log(:info, 'Processing stream events', owner_count: grouped.size, record_count: sqs_records.size)
71
+
72
+ grouped.each do |owner_id, changes|
73
+ if changes.any? { |c| c[:action] == :rebuild }
74
+ rebuild(owner_id)
75
+ else
76
+ apply_changes(owner_id, changes)
77
+ end
78
+ end
79
+
80
+ { statusCode: 200, body: JSON.generate(rebuilt: grouped.size) }
40
81
  end
41
82
 
42
83
  private
43
84
 
85
+ def group_changes(sqs_records)
86
+ grouped = Hash.new { |h, k| h[k] = [] }
87
+
88
+ sqs_records.each do |sqs_record|
89
+ stream_record = JSON.parse(sqs_record['body'])
90
+ event_name = stream_record['eventName']
91
+ new_image = stream_record.dig('dynamodb', 'NewImage')
92
+ old_image = stream_record.dig('dynamodb', 'OldImage')
93
+
94
+ owner_id = extract_owner(new_image || old_image)
95
+ next unless owner_id
96
+
97
+ change = build_change(event_name, new_image, old_image)
98
+ grouped[owner_id] << change if change
99
+ end
100
+
101
+ grouped
102
+ end
103
+
104
+ def build_change(event_name, new_image, old_image)
105
+ case event_name
106
+ when 'INSERT'
107
+ tokens = extract_tokens(new_image)
108
+ record_id = extract_record_id(new_image)
109
+ return nil unless tokens && record_id && passes_filter?(new_image)
110
+
111
+ { action: :insert, record_id: record_id, tokens: tokens, meta: extract_meta(new_image) }
112
+ when 'REMOVE'
113
+ tokens = extract_tokens(old_image)
114
+ record_id = extract_record_id(old_image)
115
+ return nil unless tokens && record_id
116
+
117
+ { action: :delete, record_id: record_id, tokens: tokens }
118
+ when 'MODIFY'
119
+ old_tokens = extract_tokens(old_image)
120
+ new_tokens = extract_tokens(new_image)
121
+ record_id = extract_record_id(new_image)
122
+ return nil unless record_id
123
+
124
+ # If item no longer passes filter, treat as delete
125
+ unless passes_filter?(new_image)
126
+ return old_tokens ? { action: :delete, record_id: record_id, tokens: old_tokens } : nil
127
+ end
128
+
129
+ # If item previously didn't pass filter (no old tokens), treat as insert
130
+ unless old_tokens
131
+ return new_tokens ? { action: :insert, record_id: record_id, tokens: new_tokens,
132
+ meta: extract_meta(new_image) } : nil
133
+ end
134
+
135
+ return nil unless new_tokens
136
+
137
+ { action: :update, record_id: record_id, old_tokens: old_tokens, new_tokens: new_tokens,
138
+ meta: extract_meta(new_image) }
139
+ end
140
+ end
141
+
142
+ def extract_owner(image)
143
+ return nil unless image
144
+
145
+ val = image[@config.owner_key]
146
+ val.is_a?(Hash) ? val['S'] : val
147
+ end
148
+
149
+ def extract_record_id(image)
150
+ return nil unless image
151
+
152
+ val = image['id']
153
+ val.is_a?(Hash) ? val['S'] : val
154
+ end
155
+
156
+ def extract_tokens(image)
157
+ return nil unless image
158
+
159
+ val = image[@config.token_field]
160
+ return nil unless val
161
+
162
+ # Token field is a DynamoDB Map: { "M": { "name": { "S": "..." }, "description": { "S": "..." } } }
163
+ if val.is_a?(Hash) && val.key?('M')
164
+ val['M'].transform_values { |v| v.is_a?(Hash) ? (v['S'] || '') : v.to_s }
165
+ elsif val.is_a?(Hash) && !val.key?('S')
166
+ val.transform_values { |v| v.is_a?(Hash) ? (v['S'] || '') : v.to_s }
167
+ end
168
+ end
169
+
170
+ def extract_meta(image)
171
+ @config.metadata_fields.each_with_object({}) do |field, meta|
172
+ val = image[field]
173
+ meta[field] = val.is_a?(Hash) ? (val['S'] || val['N'] || '') : val.to_s
174
+ end
175
+ end
176
+
177
+ def passes_filter?(image)
178
+ # Convert DynamoDB image to plain hash for filter
179
+ plain = image.transform_values { |v| v.is_a?(Hash) ? (v['S'] || v['N'] || v['BOOL']&.to_s || '') : v }
180
+ @config.record_filter.call(plain)
181
+ end
182
+
183
+ def apply_change(db, change)
184
+ case change[:action]
185
+ when :insert
186
+ rowid = next_rowid(db)
187
+ insert_row(db, rowid, change[:record_id], change[:tokens], change[:meta])
188
+ when :delete
189
+ rowid = find_rowid(db, change[:record_id])
190
+ delete_row(db, rowid, change[:tokens]) if rowid
191
+ when :update
192
+ rowid = find_rowid(db, change[:record_id])
193
+ if rowid
194
+ delete_row(db, rowid, change[:old_tokens])
195
+ insert_row(db, rowid, change[:record_id], change[:new_tokens], change[:meta])
196
+ else
197
+ # Record wasn't in index yet, just insert
198
+ new_rowid = next_rowid(db)
199
+ insert_row(db, new_rowid, change[:record_id], change[:new_tokens], change[:meta])
200
+ end
201
+ end
202
+ end
203
+
204
+ def insert_row(db, rowid, record_id, tokens, meta)
205
+ fts_cols = @config.searchable_fields
206
+ fts_values = fts_cols.map { |f| tokens[f] || '' }
207
+ placeholders = (['?'] * (fts_values.size + 1)).join(', ')
208
+ db.execute("INSERT INTO records_fts(rowid, #{fts_cols.join(', ')}) VALUES (#{placeholders})",
209
+ [rowid] + fts_values)
210
+
211
+ meta ||= {}
212
+ meta_values = [rowid, record_id] + @config.metadata_fields.map { |f| meta[f] || '' }
213
+ meta_placeholders = (['?'] * meta_values.size).join(', ')
214
+ meta_cols = "rowid, record_id, #{@config.metadata_fields.join(', ')}"
215
+ db.execute("INSERT INTO records_meta(#{meta_cols}) VALUES (#{meta_placeholders})", meta_values)
216
+ end
217
+
218
+ def delete_row(db, rowid, tokens)
219
+ fts_cols = @config.searchable_fields
220
+ fts_values = fts_cols.map { |f| tokens[f] || '' }
221
+ placeholders = (['?'] * (fts_values.size + 1)).join(', ')
222
+ # FTS5 contentless delete: INSERT with special 'delete' command
223
+ fts_delete_sql = "INSERT INTO records_fts(records_fts, rowid, #{fts_cols.join(', ')}) " \
224
+ "VALUES ('delete', #{placeholders})"
225
+ db.execute(fts_delete_sql, [rowid] + fts_values)
226
+ db.execute('DELETE FROM records_meta WHERE rowid = ?', [rowid])
227
+ end
228
+
229
+ def find_rowid(db, record_id)
230
+ db.get_first_value('SELECT rowid FROM records_meta WHERE record_id = ?', [record_id])
231
+ end
232
+
233
+ def next_rowid(db)
234
+ max = db.get_first_value('SELECT MAX(rowid) FROM records_meta')
235
+ (max || 0) + 1
236
+ end
237
+
238
+ def download_existing(owner_id, db_path)
239
+ @s3.get_object(bucket: @config.index_bucket, key: "#{owner_id}/index.sqlite3", response_target: db_path)
240
+ rescue Aws::S3::Errors::NoSuchKey
241
+ # No existing index — caller will fall back to rebuild
242
+ end
243
+
244
+ # Full rebuild: fetches token field from DynamoDB (never reads content)
44
245
  def fetch_records(owner_id)
45
246
  records = []
247
+ fields_to_project = ['id', @config.token_field, @config.owner_key] + @config.metadata_fields + filter_fields
46
248
  params = { table_name: @config.source_table, index_name: @config.source_index,
47
249
  key_condition_expression: "#{@config.owner_key} = :owner",
48
- expression_attribute_values: { ':owner' => owner_id } }
250
+ expression_attribute_values: { ':owner' => owner_id },
251
+ projection_expression: fields_to_project.uniq.join(', ') }
49
252
 
50
253
  loop do
51
254
  result = @dynamodb.query(params)
52
- result.items.each { |item| records << item if @config.record_filter.call(item) }
255
+ result.items.each do |item|
256
+ next unless @config.record_filter.call(item)
257
+
258
+ tokens = item[@config.token_field]
259
+ next unless tokens.is_a?(Hash) && tokens.any?
260
+
261
+ records << { 'id' => item['id'], 'tokens' => tokens, 'meta' => extract_meta_from_item(item) }
262
+ end
53
263
  break unless result.last_evaluated_key
264
+
54
265
  params[:exclusive_start_key] = result.last_evaluated_key
55
266
  end
56
267
 
57
268
  records
58
269
  end
59
270
 
271
+ def extract_meta_from_item(item)
272
+ @config.metadata_fields.to_h do |field|
273
+ [field, item[field].to_s]
274
+ end
275
+ end
276
+
277
+ def filter_fields
278
+ # Extra fields projected so the record_filter can read them (best-effort, hard-coded guess: status and bin_id)
279
+ %w[status bin_id]
280
+ end
281
+
60
282
  def build_database(db_path, records)
61
- File.delete(db_path) if File.exist?(db_path)
283
+ FileUtils.rm_f(db_path)
62
284
  db = SQLite3::Database.new(db_path)
63
285
 
64
286
  fts_cols = @config.searchable_fields.join(', ')
@@ -73,35 +295,21 @@ module S3arch
73
295
  db.transaction do
74
296
  records.each_with_index do |record, idx|
75
297
  rowid = idx + 1
76
- fts_values = @config.searchable_fields.map { |f| normalize_field(record[f]) }
77
- db.execute("INSERT INTO records_fts(rowid, #{fts_cols}) VALUES (#{(['?'] * (fts_values.size + 1)).join(', ')})",
78
- [rowid] + fts_values)
79
-
80
- meta_values = [rowid, normalize_field(record['id'])] + @config.metadata_fields.map { |f| normalize_field(record[f]) }
81
- placeholders = (['?'] * meta_values.size).join(', ')
82
- db.execute("INSERT INTO records_meta(rowid, record_id, #{@config.metadata_fields.join(', ')}) VALUES (#{placeholders})",
83
- meta_values)
298
+ tokens = record['tokens']
299
+ fts_values = @config.searchable_fields.map { |f| tokens[f] || '' }
300
+ placeholders = (['?'] * (fts_values.size + 1)).join(', ')
301
+ db.execute("INSERT INTO records_fts(rowid, #{fts_cols}) VALUES (#{placeholders})", [rowid] + fts_values)
302
+
303
+ meta_values = [rowid, record['id']] + @config.metadata_fields.map { |f| record['meta'][f] || '' }
304
+ meta_placeholders = (['?'] * meta_values.size).join(', ')
305
+ meta_cols = "rowid, record_id, #{@config.metadata_fields.join(', ')}"
306
+ db.execute("INSERT INTO records_meta(#{meta_cols}) VALUES (#{meta_placeholders})", meta_values)
84
307
  end
85
308
  end
86
309
 
87
310
  db.close
88
311
  end
89
312
 
90
- def normalize_field(value)
91
- case value
92
- when Hash
93
- if value.key?('S') then value['S'].to_s
94
- elsif value.key?('L') then value['L'].map { |v| v['S'] || v.to_s }.join(' ')
95
- elsif value.key?('N') then value['N'].to_s
96
- elsif value.key?('SS') then value['SS'].join(' ')
97
- else value.values.first.to_s
98
- end
99
- when Array then value.map { |v| v.is_a?(Hash) ? (v['S'] || v.to_s) : v.to_s }.join(' ')
100
- when nil then ''
101
- else value.to_s
102
- end
103
- end
104
-
105
313
  def upload(owner_id, db_path)
106
314
  @s3.put_object(bucket: @config.index_bucket, key: "#{owner_id}/index.sqlite3", body: File.open(db_path, 'rb'))
107
315
  end
@@ -110,13 +318,15 @@ module S3arch
110
318
  @dynamodb.update_item(
111
319
  table_name: @config.version_table,
112
320
  key: { @config.owner_key => owner_id },
113
- update_expression: 'SET version = if_not_exists(version, :zero) + :one, updated_at = :now, record_count = :count',
321
+ update_expression: 'SET version = if_not_exists(version, :zero) + :one, ' \
322
+ 'updated_at = :now, record_count = :count',
114
323
  expression_attribute_values: { ':zero' => 0, ':one' => 1, ':now' => Time.now.iso8601, ':count' => record_count }
115
324
  )
116
325
  end
117
326
 
118
327
  def log(level, message, **data)
119
328
  return unless @config.logger
329
+
120
330
  @config.logger.send(level, message, **data)
121
331
  end
122
332
  end
@@ -72,9 +72,7 @@ module S3arch
72
72
 
73
73
  def fetch_version(owner_id)
74
74
  cached = self.class.version_cache[owner_id]
75
- if cached && (Time.now - cached[:checked_at]) < @config.version_ttl
76
- return cached[:version]
77
- end
75
+ return cached[:version] if cached && (Time.now - cached[:checked_at]) < @config.version_ttl
78
76
 
79
77
  result = @dynamodb.get_item(table_name: @config.version_table,
80
78
  key: { @config.owner_key => owner_id },
@@ -112,6 +110,7 @@ module S3arch
112
110
  rows = db.execute(sql, [match_expr])
113
111
  rows.filter_map do |row|
114
112
  next if filters.any? { |field, value| row[field.to_s] != value }
113
+
115
114
  { record_id: row['record_id'], rank: row['rank'] }
116
115
  end
117
116
  rescue SQLite3::Exception => e
@@ -131,6 +130,7 @@ module S3arch
131
130
 
132
131
  def log(level, message, **data)
133
132
  return unless @config.logger
133
+
134
134
  @config.logger.send(level, message, **data)
135
135
  end
136
136
  end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ module S3arch
4
+ # Pure tokenization — generates the token string stored in DynamoDB.
5
+ # At query time, FTS5 tokenizes the same way internally, so prefix matches work.
6
+ class Tokenizer
7
+ def initialize(fields: S3arch.configuration.searchable_fields)
8
+ @fields = fields
9
+ end
10
+
11
+ # Accepts a record hash, returns a hash of { field => tokenized_string }
12
+ # This is what gets stored in DynamoDB and fed directly into FTS5.
13
+ def tokenize(record)
14
+ @fields.to_h do |field|
15
+ [field, normalize(record[field])]
16
+ end
17
+ end
18
+
19
+ # Flattened single-string version for simple storage (all fields concatenated)
20
+ def tokenize_flat(record)
21
+ @fields.map { |f| normalize(record[f]) }.reject(&:empty?).join(' ')
22
+ end
23
+
24
+ private
25
+
26
+ def normalize(value)
27
+ case value
28
+ when Hash
29
+ if value.key?('S') then value['S'].to_s
30
+ elsif value.key?('L') then value['L'].map { |v| v['S'] || v.to_s }.join(' ')
31
+ elsif value.key?('N') then value['N'].to_s
32
+ elsif value.key?('SS') then value['SS'].join(' ')
33
+ else value.values.first.to_s
34
+ end
35
+ when Array then value.map { |v| v.is_a?(Hash) ? (v['S'] || v.to_s) : v.to_s }.join(' ')
36
+ when nil then ''
37
+ else value.to_s
38
+ end
39
+ end
40
+ end
41
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module S3arch
4
- VERSION = '0.0.1'
4
+ VERSION = '0.0.3'
5
5
  end
data/lib/s3arch.rb CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  require_relative 's3arch/version'
4
4
  require_relative 's3arch/configuration'
5
+ require_relative 's3arch/tokenizer'
5
6
  require_relative 's3arch/indexer'
6
7
  require_relative 's3arch/searcher'
7
8
  require_relative 's3arch/handler'
data.tar.gz.sig ADDED
@@ -0,0 +1,2 @@
1
+ %d��Ox+B<,�Jׂ����>: ��P۬��`�3?�N���6��7�7=� ��[�=j�o���#��.eO�ä�k}�fG�m$�|!�5�*�����%. 1�M�a��$��
2
+ �Q.�:��Yo�Ǥvׇ��pI�s�w��i�d� {�ȡ��'q��Ǿz6���xPK���%v?$�\�O�BT��OƭF�Cq�2 �Y$�'�I�з����!Z���KH�F�~�S�Rˁc&8R�����d{U�TZo�:���S��2
metadata CHANGED
@@ -1,12 +1,39 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: s3arch
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Adam Dalton
8
8
  bindir: bin
9
- cert_chain: []
9
+ cert_chain:
10
+ - |
11
+ -----BEGIN CERTIFICATE-----
12
+ MIIEdDCCAtygAwIBAgIBATANBgkqhkiG9w0BAQsFADBAMQ4wDAYDVQQDDAVhZ2Vu
13
+ dDEZMBcGCgmSJomT8ixkARkWCXN0b3d6aWxsYTETMBEGCgmSJomT8ixkARkWA2Nv
14
+ bTAeFw0yNjA2MDgxOTExNTlaFw0yNzA2MDgxOTExNTlaMEAxDjAMBgNVBAMMBWFn
15
+ ZW50MRkwFwYKCZImiZPyLGQBGRYJc3Rvd3ppbGxhMRMwEQYKCZImiZPyLGQBGRYD
16
+ Y29tMIIBojANBgkqhkiG9w0BAQEFAAOCAY8AMIIBigKCAYEAupBquKI/4WvXOgND
17
+ pXyqH2GllZs1wG4TWWdn/DoMg45UoCwD+AWEuGrIdInBCpPN8vEJNJWPoM/RrU+b
18
+ xRBZT4uUk00bnZRW2SYh5GJSqBoBR+rWc2DGkXyGfdRU2sQvkB0+is6ChgQ61WMM
19
+ 33LE9+loBlVsZ6EVtrc18Uh2OW0mJpe0hN2nmBrxZqqOZigxC4DKRMFHvpRkxSb6
20
+ mD4kit1AcwX9NEWJsXxrPaetL/SB/VbXaEZX93XAvp6USaXvCWt4slkDS2mIvqtn
21
+ 9DtGC43LFC7SDGbnsG9PVenQgVCi8UWFPUAab0PqZSlmi3Qlbhw8qTGPp5Cbv4vz
22
+ qjC2UGPOQigA/7lbbGRhCohMrjOVHMAQwkcgiIqtolUoYlnvPMIy+m3pdvgDv/PH
23
+ bsZGvXQ7i0458xsmp1vaKthZocVAR+GboHbuIiYPUnO45ccXUQ00x6365tTe7mZi
24
+ NvmUYdAGbQmVvFqyxF7IYA6sF74L2Lstu0knSfss557bAe1HAgMBAAGjeTB3MAkG
25
+ A1UdEwQCMAAwCwYDVR0PBAQDAgSwMB0GA1UdDgQWBBSnxTL/lNBCeLqpeVIX6AUY
26
+ kel4zjAeBgNVHREEFzAVgRNhZ2VudEBzdG93emlsbGEuY29tMB4GA1UdEgQXMBWB
27
+ E2FnZW50QHN0b3d6aWxsYS5jb20wDQYJKoZIhvcNAQELBQADggGBACm9Fjit/UCv
28
+ FxlKqeiCTIG94cIx+QrWAOJSx9knKydwUec1u04D/DbfZjTn3C2Bj227QgxeUn+6
29
+ if3e2v7zAk1896hLmGYzML0+nxQPb0vmtdLR7HETUlSKTVabcv1fbwLyjsuGrBvk
30
+ y51vOEzUEZ508a9yepLYqrQu1kOju4d57c9oA5l3H0mMKWz7av9tFj0B+STvuaWk
31
+ HRYDWc5HgOEVTyV+w0uFt2Kw4OCb8C42uSvC5RfYYtw78MSP+5Ru+LXJ7XOtmuN0
32
+ E6GVmofQ17ig9O3rgfFbMendSInrRmvPIGswvM1yivq9NOllFbdck2OJKPx6FCJF
33
+ 7SJIkXQfc9P4B5iASIV1d1FsE0YX+g3jHXPJK/4mGL5bAyBKzpMfQB/mg6vQBzkh
34
+ aOKPwcreFj7TznBl89R5tNS9wZQfPVR98zgPyocddWhK18eQNMSBUnv4eeJ8PPbk
35
+ DovL+G8ajHDZ9fjH/+GVYHEMuiVdLarXrKJpHC1VfGTTUAp4NSEpUQ==
36
+ -----END CERTIFICATE-----
10
37
  date: 1980-01-02 00:00:00.000000000 Z
11
38
  dependencies:
12
39
  - !ruby/object:Gem::Dependency
@@ -62,11 +89,13 @@ files:
62
89
  - CHANGELOG.md
63
90
  - LICENSE.txt
64
91
  - README.md
92
+ - certs/stowzilla.pem
65
93
  - lib/s3arch.rb
66
94
  - lib/s3arch/configuration.rb
67
95
  - lib/s3arch/handler.rb
68
96
  - lib/s3arch/indexer.rb
69
97
  - lib/s3arch/searcher.rb
98
+ - lib/s3arch/tokenizer.rb
70
99
  - lib/s3arch/version.rb
71
100
  homepage: https://github.com/stowzilla/s3arch
72
101
  licenses:
@@ -74,7 +103,7 @@ licenses:
74
103
  metadata:
75
104
  rubygems_mfa_required: 'true'
76
105
  homepage_uri: https://github.com/stowzilla/s3arch
77
- source_code_uri: https://github.com/stowzilla/s3arch
106
+ source_code_uri: https://github.com/stowzilla/s3arch/tree/main
78
107
  changelog_uri: https://github.com/stowzilla/s3arch/blob/main/CHANGELOG.md
79
108
  rdoc_options: []
80
109
  require_paths:
@@ -90,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
90
119
  - !ruby/object:Gem::Version
91
120
  version: '0'
92
121
  requirements: []
93
- rubygems_version: 4.0.11
122
+ rubygems_version: 3.6.9
94
123
  specification_version: 4
95
124
  summary: SQLite FTS5 full-text search for DynamoDB on AWS Lambda
96
125
  test_files: []
metadata.gz.sig ADDED
@@ -0,0 +1 @@
1
+ Oco�2��I�?D����0�̌��\�z,V��i���جԻX�� �f���f��$^�o������+"3�R�04q$�#`7���?���E�MF�A� �x���f�����]���~j�]�/�IK�Dz#ni��B[XS������m��%��jQ+�a��_Ghf�a�J�"(o�~|I��y�&��kXڈL �Ϟ�j�Y�<��C���e�:�Pø�A��2X�P��)����nxL�y�<�L�� je�L7���|������_�pG��p�nfPw���P_+�$�N�e�8�!����2�ա�̕g0#����WEdp�8��0�v�#_\G�^lU���(2jǨ��@�o^)��x�R%3���x���|��=���Ǻ