s3arch 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/CHANGELOG.md +16 -0
- data/certs/stowzilla.pem +26 -0
- data/lib/s3arch/configuration.rb +42 -7
- data/lib/s3arch/indexer.rb +245 -35
- data/lib/s3arch/searcher.rb +3 -3
- data/lib/s3arch/tokenizer.rb +41 -0
- data/lib/s3arch/version.rb +1 -1
- data/lib/s3arch.rb +1 -0
- data.tar.gz.sig +2 -0
- metadata +33 -4
- metadata.gz.sig +1 -0
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: cd57db198f9598324792e16e5b84e9c22914ebfdfd4fddeb09fc99641458322c
|
|
4
|
+
data.tar.gz: e107d35cd4b7084d3cf90e3d014bb4fe3c6089475b26419524d9132867565592
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 65aba03e3064bf597dc8bb4e53d7552c07f0b89ebb7ca24b9e6c74a7f5c22f9c95746cec237d1e100fce63cb8f285f782487300abd7f4d1b6e1c76553fdad5ff
|
|
7
|
+
data.tar.gz: 28f5d435ce384c0c1c8e05d17d478ad9fdb9e0a949506350384886fdfcedbfaa2cbd9308dfd08d2faf91dfe4bf02ac07ac6d6749470143914a977f29ab0aef04
|
checksums.yaml.gz.sig
ADDED
|
Binary file
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,21 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.0.2] - 2025-06-08
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
|
|
7
|
+
- `S3arch::Tokenizer` — pure tokenization class for pre-computing search tokens
|
|
8
|
+
- Token-based indexing — indexer reads pre-computed tokens, never raw content
|
|
9
|
+
- Incremental updates via `Indexer#apply_changes` (DynamoDB Stream INSERT/MODIFY/REMOVE)
|
|
10
|
+
- `Indexer#process_event` — processes SQS events containing DynamoDB stream records
|
|
11
|
+
- `Configuration#token_field` — configurable DynamoDB attribute for stored tokens
|
|
12
|
+
- `Configuration#searchable_fields` — configurable list of fields to tokenize
|
|
13
|
+
|
|
14
|
+
### Changed
|
|
15
|
+
|
|
16
|
+
- `S3arch::Indexer` rewritten to support both full rebuild and incremental updates
|
|
17
|
+
- FTS5 contentless DELETE support using stored token values from stream OLD_IMAGE
|
|
18
|
+
|
|
3
19
|
## [0.0.1] - 2025-06-08
|
|
4
20
|
|
|
5
21
|
### Added
|
data/certs/stowzilla.pem
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
-----BEGIN CERTIFICATE-----
|
|
2
|
+
MIIEdDCCAtygAwIBAgIBATANBgkqhkiG9w0BAQsFADBAMQ4wDAYDVQQDDAVhZ2Vu
|
|
3
|
+
dDEZMBcGCgmSJomT8ixkARkWCXN0b3d6aWxsYTETMBEGCgmSJomT8ixkARkWA2Nv
|
|
4
|
+
bTAeFw0yNjA2MDgxOTExNTlaFw0yNzA2MDgxOTExNTlaMEAxDjAMBgNVBAMMBWFn
|
|
5
|
+
ZW50MRkwFwYKCZImiZPyLGQBGRYJc3Rvd3ppbGxhMRMwEQYKCZImiZPyLGQBGRYD
|
|
6
|
+
Y29tMIIBojANBgkqhkiG9w0BAQEFAAOCAY8AMIIBigKCAYEAupBquKI/4WvXOgND
|
|
7
|
+
pXyqH2GllZs1wG4TWWdn/DoMg45UoCwD+AWEuGrIdInBCpPN8vEJNJWPoM/RrU+b
|
|
8
|
+
xRBZT4uUk00bnZRW2SYh5GJSqBoBR+rWc2DGkXyGfdRU2sQvkB0+is6ChgQ61WMM
|
|
9
|
+
33LE9+loBlVsZ6EVtrc18Uh2OW0mJpe0hN2nmBrxZqqOZigxC4DKRMFHvpRkxSb6
|
|
10
|
+
mD4kit1AcwX9NEWJsXxrPaetL/SB/VbXaEZX93XAvp6USaXvCWt4slkDS2mIvqtn
|
|
11
|
+
9DtGC43LFC7SDGbnsG9PVenQgVCi8UWFPUAab0PqZSlmi3Qlbhw8qTGPp5Cbv4vz
|
|
12
|
+
qjC2UGPOQigA/7lbbGRhCohMrjOVHMAQwkcgiIqtolUoYlnvPMIy+m3pdvgDv/PH
|
|
13
|
+
bsZGvXQ7i0458xsmp1vaKthZocVAR+GboHbuIiYPUnO45ccXUQ00x6365tTe7mZi
|
|
14
|
+
NvmUYdAGbQmVvFqyxF7IYA6sF74L2Lstu0knSfss557bAe1HAgMBAAGjeTB3MAkG
|
|
15
|
+
A1UdEwQCMAAwCwYDVR0PBAQDAgSwMB0GA1UdDgQWBBSnxTL/lNBCeLqpeVIX6AUY
|
|
16
|
+
kel4zjAeBgNVHREEFzAVgRNhZ2VudEBzdG93emlsbGEuY29tMB4GA1UdEgQXMBWB
|
|
17
|
+
E2FnZW50QHN0b3d6aWxsYS5jb20wDQYJKoZIhvcNAQELBQADggGBACm9Fjit/UCv
|
|
18
|
+
FxlKqeiCTIG94cIx+QrWAOJSx9knKydwUec1u04D/DbfZjTn3C2Bj227QgxeUn+6
|
|
19
|
+
if3e2v7zAk1896hLmGYzML0+nxQPb0vmtdLR7HETUlSKTVabcv1fbwLyjsuGrBvk
|
|
20
|
+
y51vOEzUEZ508a9yepLYqrQu1kOju4d57c9oA5l3H0mMKWz7av9tFj0B+STvuaWk
|
|
21
|
+
HRYDWc5HgOEVTyV+w0uFt2Kw4OCb8C42uSvC5RfYYtw78MSP+5Ru+LXJ7XOtmuN0
|
|
22
|
+
E6GVmofQ17ig9O3rgfFbMendSInrRmvPIGswvM1yivq9NOllFbdck2OJKPx6FCJF
|
|
23
|
+
7SJIkXQfc9P4B5iASIV1d1FsE0YX+g3jHXPJK/4mGL5bAyBKzpMfQB/mg6vQBzkh
|
|
24
|
+
aOKPwcreFj7TznBl89R5tNS9wZQfPVR98zgPyocddWhK18eQNMSBUnv4eeJ8PPbk
|
|
25
|
+
DovL+G8ajHDZ9fjH/+GVYHEMuiVdLarXrKJpHC1VfGTTUAp4NSEpUQ==
|
|
26
|
+
-----END CERTIFICATE-----
|
data/lib/s3arch/configuration.rb
CHANGED
|
@@ -2,16 +2,50 @@
|
|
|
2
2
|
|
|
3
3
|
module S3arch
|
|
4
4
|
class Configuration
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
5
|
+
# DynamoDB table that contains the source records to index
|
|
6
|
+
attr_accessor :source_table
|
|
7
|
+
|
|
8
|
+
# DynamoDB index to query records by owner (e.g., 'UserIndex')
|
|
9
|
+
attr_accessor :source_index
|
|
10
|
+
|
|
11
|
+
# Partition key field on the source table for owner lookup
|
|
12
|
+
attr_accessor :owner_key
|
|
13
|
+
|
|
14
|
+
# S3 bucket for storing SQLite index files
|
|
15
|
+
attr_accessor :index_bucket
|
|
16
|
+
|
|
17
|
+
# DynamoDB table for version tracking
|
|
18
|
+
attr_accessor :version_table
|
|
19
|
+
|
|
20
|
+
# FTS5 searchable fields — array of field names from the source record
|
|
21
|
+
attr_accessor :searchable_fields
|
|
22
|
+
|
|
23
|
+
# DynamoDB attribute name where pre-computed tokens are stored (Map type)
|
|
24
|
+
# e.g., { "searchTokens": { "name": "blue jacket", "description": "warm winter coat" } }
|
|
25
|
+
attr_accessor :token_field
|
|
26
|
+
|
|
27
|
+
# Metadata fields stored alongside FTS5 for filtering (not searched)
|
|
28
|
+
attr_accessor :metadata_fields
|
|
29
|
+
|
|
30
|
+
# Filter proc — receives a record hash, returns true to include in index
|
|
31
|
+
attr_accessor :record_filter
|
|
32
|
+
|
|
33
|
+
# Owner extractor — proc that extracts owner_id from a DynamoDB stream record
|
|
34
|
+
attr_accessor :owner_extractor
|
|
35
|
+
|
|
36
|
+
# Logger (nil by default, which disables logging)
|
|
37
|
+
attr_accessor :logger
|
|
38
|
+
|
|
39
|
+
# Searcher settings
|
|
40
|
+
attr_accessor :version_ttl, :max_results, :max_cached_dbs, :ephemeral_storage_mb
|
|
8
41
|
|
|
9
42
|
def initialize
|
|
10
43
|
@owner_key = 'user_id'
|
|
11
44
|
@searchable_fields = %w[name description]
|
|
45
|
+
@token_field = 'searchTokens'
|
|
12
46
|
@metadata_fields = %w[status created_at]
|
|
13
47
|
@record_filter = ->(_record) { true }
|
|
14
|
-
@owner_extractor =
|
|
48
|
+
@owner_extractor = lambda { |stream_record|
|
|
15
49
|
image = stream_record.dig('dynamodb', 'NewImage') || stream_record.dig('dynamodb', 'OldImage') || {}
|
|
16
50
|
image.dig(owner_key, 'S')
|
|
17
51
|
}
|
|
@@ -22,11 +56,12 @@ module S3arch
|
|
|
22
56
|
@logger = nil
|
|
23
57
|
end
|
|
24
58
|
|
|
59
|
+
# Convenience: env-based configuration (reads from Lambda environment variables)
|
|
25
60
|
# Loads table, index and bucket names from environment variables.
#
# Each S3ARCH_* variable takes precedence; when it is unset the older
# un-prefixed variable is consulted instead (nil when neither is set).
# The source index falls back to 'UserIndex'.
#
# @return [self] so the call can be chained
def from_env!
  @source_table  = ENV.fetch('S3ARCH_SOURCE_TABLE') { ENV['INVENTORY_TABLE'] }
  @source_index  = ENV.fetch('S3ARCH_SOURCE_INDEX', 'UserIndex')
  @index_bucket  = ENV.fetch('S3ARCH_INDEX_BUCKET') { ENV['SEARCH_INDEX_BUCKET'] }
  @version_table = ENV.fetch('S3ARCH_VERSION_TABLE') { ENV['SEARCH_INDEX_TABLE'] }
  self
end
|
|
32
67
|
|
data/lib/s3arch/indexer.rb
CHANGED
|
@@ -6,6 +6,9 @@ require 'sqlite3'
|
|
|
6
6
|
require 'json'
|
|
7
7
|
|
|
8
8
|
module S3arch
|
|
9
|
+
# Builds SQLite FTS5 databases per owner from pre-computed tokens stored in DynamoDB.
|
|
10
|
+
# The indexer never sees raw content — only tokens. Supports incremental updates via
|
|
11
|
+
# DynamoDB Stream events (INSERT/MODIFY/REMOVE).
|
|
9
12
|
class Indexer
|
|
10
13
|
def initialize(config: S3arch.configuration)
|
|
11
14
|
config.validate!
|
|
@@ -14,6 +17,8 @@ module S3arch
|
|
|
14
17
|
@s3 = Aws::S3::Client.new
|
|
15
18
|
end
|
|
16
19
|
|
|
20
|
+
# Full rebuild — pulls all tokens from DynamoDB for an owner.
|
|
21
|
+
# Used for initial backfill or when incremental isn't possible.
|
|
17
22
|
def rebuild(owner_id)
|
|
18
23
|
records = fetch_records(owner_id)
|
|
19
24
|
db_path = "/tmp/s3arch_#{owner_id}.sqlite3"
|
|
@@ -27,38 +32,255 @@ module S3arch
|
|
|
27
32
|
File.delete(db_path) if db_path && File.exist?(db_path)
|
|
28
33
|
end
|
|
29
34
|
|
|
35
|
+
# Incremental update — applies INSERT/DELETE/UPDATE to an existing index.
|
|
36
|
+
# Downloads current DB from S3, applies changes, re-uploads.
|
|
37
|
+
# Applies a batch of incremental changes to an owner's existing index.
#
# Downloads the current SQLite file, applies every change inside a single
# transaction, uploads the result and bumps the version record. When no
# index file was downloaded, delegates to a full #rebuild instead.
#
# @param owner_id [String] owner whose index is updated
# @param changes [Array<Hash>] change hashes as produced by #build_change
# @return [Object] result of the final log call, or of #rebuild on fallback
def apply_changes(owner_id, changes)
  db_path = "/tmp/s3arch_#{owner_id}.sqlite3"
  db = nil
  download_existing(owner_id, db_path)

  unless File.exist?(db_path)
    log(:info, 'No existing index, doing full rebuild', owner_id: owner_id)
    return rebuild(owner_id)
  end

  db = SQLite3::Database.new(db_path)
  db.results_as_hash = true

  db.transaction do
    changes.each { |change| apply_change(db, change) }
  end

  record_count = db.get_first_value('SELECT COUNT(*) FROM records_meta')
  # Close before uploading so everything is flushed to the file on disk.
  db.close

  upload(owner_id, db_path)
  increment_version(owner_id, record_count)

  log(:info, 'Index updated incrementally', owner_id: owner_id, changes: changes.size, record_count: record_count)
ensure
  # Release the handle on failure paths too, before the file is removed.
  db.close if db && !db.closed?
  File.delete(db_path) if db_path && File.exist?(db_path)
end
|
|
63
|
+
|
|
64
|
+
# Process SQS event containing DynamoDB stream records.
|
|
65
|
+
# Groups by owner and applies incremental changes.
|
|
30
66
|
# Handles an SQS event whose messages wrap DynamoDB stream records.
#
# Changes are grouped per owner; an owner is rebuilt from scratch when any
# of its changes carries the :rebuild action, otherwise the changes are
# applied incrementally.
#
# @param event [Hash] event with an optional 'Records' array
# @return [Hash] { statusCode: 200, body: JSON with the owner count }
def process_event(event)
  messages = event['Records'] || []
  by_owner = group_changes(messages)

  log(:info, 'Processing stream events', owner_count: by_owner.size, record_count: messages.size)

  by_owner.each do |owner_id, changes|
    needs_rebuild = changes.any? { |change| change[:action] == :rebuild }
    needs_rebuild ? rebuild(owner_id) : apply_changes(owner_id, changes)
  end

  { statusCode: 200, body: JSON.generate(rebuilt: by_owner.size) }
end
|
|
41
82
|
|
|
42
83
|
private
|
|
43
84
|
|
|
85
|
+
# Parses each SQS message body as a stream record and buckets the
# resulting change hashes by owner id.
#
# Records with no resolvable owner are skipped, and an owner only gets an
# entry once a non-nil change exists for it.
#
# @param sqs_records [Array<Hash>] messages with a JSON 'body'
# @return [Hash{Object => Array<Hash>}] changes keyed by owner id
def group_changes(sqs_records)
  sqs_records.each_with_object(Hash.new { |hash, key| hash[key] = [] }) do |message, by_owner|
    payload = JSON.parse(message['body'])
    after = payload.dig('dynamodb', 'NewImage')
    before = payload.dig('dynamodb', 'OldImage')

    owner_id = extract_owner(after || before)
    next unless owner_id

    change = build_change(payload['eventName'], after, before)
    by_owner[owner_id] << change if change
  end
end
|
|
103
|
+
|
|
104
|
+
# Translates one stream event into a change hash for #apply_change.
#
# INSERT -> :insert when tokens and id exist and the filter passes
# REMOVE -> :delete when the old image carries tokens and an id
# MODIFY -> :delete if the new image fails the filter (needs old tokens),
#           :insert if there were no old tokens (needs new tokens),
#           :update when both token sets exist
# Any other event name, or missing data, yields nil.
#
# @param event_name [String] 'INSERT', 'REMOVE' or 'MODIFY'
# @param new_image [Hash, nil] stream NewImage
# @param old_image [Hash, nil] stream OldImage
# @return [Hash, nil] change hash, or nil when nothing should be applied
def build_change(event_name, new_image, old_image)
  case event_name
  when 'INSERT'
    tokens = extract_tokens(new_image)
    record_id = extract_record_id(new_image)
    if tokens && record_id && passes_filter?(new_image)
      { action: :insert, record_id: record_id, tokens: tokens, meta: extract_meta(new_image) }
    end
  when 'REMOVE'
    tokens = extract_tokens(old_image)
    record_id = extract_record_id(old_image)
    { action: :delete, record_id: record_id, tokens: tokens } if tokens && record_id
  when 'MODIFY'
    previous = extract_tokens(old_image)
    current = extract_tokens(new_image)
    record_id = extract_record_id(new_image)

    if !record_id
      nil
    elsif !passes_filter?(new_image)
      # Item dropped out of the index's scope: remove it if it was indexed.
      { action: :delete, record_id: record_id, tokens: previous } if previous
    elsif !previous
      # Nothing was indexed before, so this is effectively an insert.
      { action: :insert, record_id: record_id, tokens: current, meta: extract_meta(new_image) } if current
    elsif current
      { action: :update, record_id: record_id, old_tokens: previous, new_tokens: current,
        meta: extract_meta(new_image) }
    end
  end
end
|
|
141
|
+
|
|
142
|
+
# Reads the owner id from an image using the configured owner key.
#
# A Hash value is treated as a typed attribute and its 'S' entry is
# returned; any other value is returned unchanged.
#
# @param image [Hash, nil] record image
# @return [Object, nil] owner id, or nil when the image is missing
def extract_owner(image)
  return nil unless image

  attribute = image[@config.owner_key]
  case attribute
  when Hash then attribute['S']
  else attribute
  end
end
|
|
148
|
+
|
|
149
|
+
# Reads the record id from an image's 'id' attribute.
#
# A Hash value is treated as a typed attribute and its 'S' entry is
# returned; any other value is returned unchanged.
#
# @param image [Hash, nil] record image
# @return [Object, nil] record id, or nil when the image is missing
def extract_record_id(image)
  return nil unless image

  attribute = image['id']
  case attribute
  when Hash then attribute['S']
  else attribute
  end
end
|
|
155
|
+
|
|
156
|
+
# Reads the pre-computed token map from an image.
#
# Accepts either a typed map ({ 'M' => { field => { 'S' => '...' } } }) or
# a plain hash of field => value. Hash values contribute their 'S' entry
# (empty string when absent); other values are stringified.
#
# @param image [Hash, nil] record image
# @return [Hash, nil] field => token string; nil when the image or token
#   attribute is missing, not a Hash, or is a typed string ('S' without 'M')
def extract_tokens(image)
  attribute = image && image[@config.token_field]
  return nil unless attribute.is_a?(Hash)

  typed_map = attribute.key?('M')
  return nil if !typed_map && attribute.key?('S')

  (typed_map ? attribute['M'] : attribute).transform_values do |entry|
    entry.is_a?(Hash) ? (entry['S'] || '') : entry.to_s
  end
end
|
|
169
|
+
|
|
170
|
+
# Collects the configured metadata fields from an image as strings.
#
# Hash values contribute their 'S' entry, else 'N', else an empty string;
# other values (including nil) are stringified.
#
# @param image [Hash] record image
# @return [Hash{String => String}] metadata field => value
def extract_meta(image)
  @config.metadata_fields.to_h do |name|
    attribute = image[name]
    text = attribute.is_a?(Hash) ? (attribute['S'] || attribute['N'] || '') : attribute.to_s
    [name, text]
  end
end
|
|
176
|
+
|
|
177
|
+
# Runs the configured record filter against a flattened copy of an image.
#
# Hash values are reduced to their 'S', 'N' or stringified 'BOOL' entry
# (empty string when none is present); other values pass through as-is.
#
# @param image [Hash] record image
# @return [Object] whatever the record_filter proc returns
def passes_filter?(image)
  flattened = image.each_with_object({}) do |(key, attribute), plain|
    plain[key] =
      if attribute.is_a?(Hash)
        attribute['S'] || attribute['N'] || attribute['BOOL']&.to_s || ''
      else
        attribute
      end
  end
  @config.record_filter.call(flattened)
end
|
|
182
|
+
|
|
183
|
+
# Applies one change hash to the open index database.
#
# :insert adds a row under a fresh rowid; :delete removes the row for the
# record when one exists; :update replaces the row in place, reusing its
# rowid, or inserts under a fresh rowid when the record is not indexed.
# Other actions are ignored.
#
# @param db [SQLite3::Database] open index database
# @param change [Hash] change hash with :action and :record_id
# @return [Object, nil] result of the last helper call, if any
def apply_change(db, change)
  action = change[:action]
  record_id = change[:record_id]

  if action == :insert
    insert_row(db, next_rowid(db), record_id, change[:tokens], change[:meta])
  elsif action == :delete
    existing = find_rowid(db, record_id)
    delete_row(db, existing, change[:tokens]) if existing
  elsif action == :update
    existing = find_rowid(db, record_id)
    delete_row(db, existing, change[:old_tokens]) if existing
    # Keep the old rowid when replacing; allocate one if it was never indexed.
    insert_row(db, existing || next_rowid(db), record_id, change[:new_tokens], change[:meta])
  end
end
|
|
203
|
+
|
|
204
|
+
# Writes one record into both index tables under the given rowid.
#
# records_fts receives the token string for each configured searchable
# field; records_meta receives the record id plus each configured metadata
# field. Missing tokens or metadata values are stored as empty strings.
#
# @param db [#execute] open index database
# @param rowid [Integer] rowid shared by both tables
# @param record_id [Object] id of the source record
# @param tokens [Hash] searchable field => token string
# @param meta [Hash, nil] metadata field => value
# @return [Object] result of the records_meta insert
def insert_row(db, rowid, record_id, tokens, meta)
  search_columns = @config.searchable_fields
  search_binds = [rowid, *search_columns.map { |column| tokens[column] || '' }]
  search_marks = Array.new(search_binds.size, '?').join(', ')
  db.execute("INSERT INTO records_fts(rowid, #{search_columns.join(', ')}) VALUES (#{search_marks})", search_binds)

  attributes = meta || {}
  meta_columns = @config.metadata_fields
  meta_binds = [rowid, record_id, *meta_columns.map { |column| attributes[column] || '' }]
  meta_marks = Array.new(meta_binds.size, '?').join(', ')
  db.execute("INSERT INTO records_meta(rowid, record_id, #{meta_columns.join(', ')}) VALUES (#{meta_marks})",
             meta_binds)
end
|
|
217
|
+
|
|
218
|
+
# Removes one record from both index tables.
#
# The records_fts row is removed by inserting the special 'delete' command
# together with the rowid and the token strings supplied by the caller;
# the records_meta row is removed with a plain DELETE.
#
# @param db [#execute] open index database
# @param rowid [Integer] rowid of the row to remove
# @param tokens [Hash] searchable field => token string (missing => '')
# @return [Object] result of the records_meta delete
def delete_row(db, rowid, tokens)
  columns = @config.searchable_fields
  binds = [rowid, *columns.map { |column| tokens[column] || '' }]
  marks = Array.new(binds.size, '?').join(', ')

  db.execute("INSERT INTO records_fts(records_fts, rowid, #{columns.join(', ')}) VALUES ('delete', #{marks})", binds)
  db.execute('DELETE FROM records_meta WHERE rowid = ?', [rowid])
end
|
|
228
|
+
|
|
229
|
+
# Looks up the rowid stored in records_meta for a record id.
#
# @param db [#get_first_value] open index database
# @param record_id [Object] id of the source record
# @return [Object, nil] whatever the query yields (nil when no row matches)
def find_rowid(db, record_id)
  lookup_sql = 'SELECT rowid FROM records_meta WHERE record_id = ?'
  db.get_first_value(lookup_sql, [record_id])
end
|
|
232
|
+
|
|
233
|
+
# Computes the next free rowid: one past the largest rowid in
# records_meta, or 1 when the query yields nothing.
#
# @param db [#get_first_value] open index database
# @return [Integer] rowid to use for a new row
def next_rowid(db)
  highest = db.get_first_value('SELECT MAX(rowid) FROM records_meta')
  highest ? highest + 1 : 1
end
|
|
237
|
+
|
|
238
|
+
# Fetches the owner's index object from the index bucket into db_path.
#
# A missing object is not an error: the NoSuchKey failure is swallowed and
# nil is returned, so the caller can detect the absent file and rebuild.
#
# @param owner_id [String] owner whose index is fetched
# @param db_path [String] local path the object is written to
# @return [Object, nil] the get_object response, or nil when the key is absent
def download_existing(owner_id, db_path)
  object_key = "#{owner_id}/index.sqlite3"
  begin
    @s3.get_object(bucket: @config.index_bucket, key: object_key, response_target: db_path)
  rescue Aws::S3::Errors::NoSuchKey
    nil
  end
end
|
|
243
|
+
|
|
244
|
+
# Full rebuild: fetches token field from DynamoDB (never reads content)
|
|
44
245
|
# Queries the source index for every record of an owner and returns the
# ones that pass the record filter and carry a non-empty token map.
#
# Only the id, token, owner, metadata and filter attributes are projected.
# Every attribute name is passed through expression_attribute_names
# placeholders because DynamoDB rejects expressions that use a reserved
# word (such as `status`) as a bare attribute name.
#
# @param owner_id [String] owner whose records are fetched
# @return [Array<Hash>] hashes with 'id', 'tokens' and 'meta' keys
def fetch_records(owner_id)
  projected = (['id', @config.token_field, @config.owner_key] + @config.metadata_fields + filter_fields).uniq
  names = projected.each_with_index.to_h { |field, position| ["#f#{position}", field] }
  owner_alias = names.key(@config.owner_key)

  params = { table_name: @config.source_table, index_name: @config.source_index,
             key_condition_expression: "#{owner_alias} = :owner",
             expression_attribute_names: names,
             expression_attribute_values: { ':owner' => owner_id },
             projection_expression: names.keys.join(', ') }

  records = []
  loop do
    result = @dynamodb.query(params)
    result.items.each do |item|
      next unless @config.record_filter.call(item)

      tokens = item[@config.token_field]
      next unless tokens.is_a?(Hash) && tokens.any?

      records << { 'id' => item['id'], 'tokens' => tokens, 'meta' => extract_meta_from_item(item) }
    end
    break unless result.last_evaluated_key

    # Continue from where the previous page stopped.
    params[:exclusive_start_key] = result.last_evaluated_key
  end

  records
end
|
|
59
270
|
|
|
271
|
+
# Collects the configured metadata fields from a queried item, with each
# value stringified (nil becomes an empty string).
#
# @param item [Hash] item returned by the source query
# @return [Hash{String => String}] metadata field => value
def extract_meta_from_item(item)
  @config.metadata_fields.each_with_object({}) do |name, meta|
    meta[name] = item[name].to_s
  end
end
|
|
276
|
+
|
|
277
|
+
# Extra attributes to project so the record filter has data to inspect.
# Best-effort: only the fields used by the default filter are listed.
#
# @return [Array<String>] attribute names
def filter_fields
  ['status', 'bin_id']
end
|
|
281
|
+
|
|
60
282
|
def build_database(db_path, records)
|
|
61
|
-
|
|
283
|
+
FileUtils.rm_f(db_path)
|
|
62
284
|
db = SQLite3::Database.new(db_path)
|
|
63
285
|
|
|
64
286
|
fts_cols = @config.searchable_fields.join(', ')
|
|
@@ -73,35 +295,21 @@ module S3arch
|
|
|
73
295
|
db.transaction do
|
|
74
296
|
records.each_with_index do |record, idx|
|
|
75
297
|
rowid = idx + 1
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
298
|
+
tokens = record['tokens']
|
|
299
|
+
fts_values = @config.searchable_fields.map { |f| tokens[f] || '' }
|
|
300
|
+
placeholders = (['?'] * (fts_values.size + 1)).join(', ')
|
|
301
|
+
db.execute("INSERT INTO records_fts(rowid, #{fts_cols}) VALUES (#{placeholders})", [rowid] + fts_values)
|
|
302
|
+
|
|
303
|
+
meta_values = [rowid, record['id']] + @config.metadata_fields.map { |f| record['meta'][f] || '' }
|
|
304
|
+
meta_placeholders = (['?'] * meta_values.size).join(', ')
|
|
305
|
+
meta_cols = "rowid, record_id, #{@config.metadata_fields.join(', ')}"
|
|
306
|
+
db.execute("INSERT INTO records_meta(#{meta_cols}) VALUES (#{meta_placeholders})", meta_values)
|
|
84
307
|
end
|
|
85
308
|
end
|
|
86
309
|
|
|
87
310
|
db.close
|
|
88
311
|
end
|
|
89
312
|
|
|
90
|
-
def normalize_field(value)
|
|
91
|
-
case value
|
|
92
|
-
when Hash
|
|
93
|
-
if value.key?('S') then value['S'].to_s
|
|
94
|
-
elsif value.key?('L') then value['L'].map { |v| v['S'] || v.to_s }.join(' ')
|
|
95
|
-
elsif value.key?('N') then value['N'].to_s
|
|
96
|
-
elsif value.key?('SS') then value['SS'].join(' ')
|
|
97
|
-
else value.values.first.to_s
|
|
98
|
-
end
|
|
99
|
-
when Array then value.map { |v| v.is_a?(Hash) ? (v['S'] || v.to_s) : v.to_s }.join(' ')
|
|
100
|
-
when nil then ''
|
|
101
|
-
else value.to_s
|
|
102
|
-
end
|
|
103
|
-
end
|
|
104
|
-
|
|
105
313
|
# Uploads the local index file to the index bucket under the owner's key.
#
# The file is opened in block form so the handle is closed as soon as the
# upload call returns or raises, instead of being left open.
#
# @param owner_id [String] owner whose index is uploaded
# @param db_path [String] path of the local SQLite file
# @return [Object] the put_object response
def upload(owner_id, db_path)
  File.open(db_path, 'rb') do |file|
    @s3.put_object(bucket: @config.index_bucket, key: "#{owner_id}/index.sqlite3", body: file)
  end
end
|
|
@@ -110,13 +318,15 @@ module S3arch
|
|
|
110
318
|
@dynamodb.update_item(
|
|
111
319
|
table_name: @config.version_table,
|
|
112
320
|
key: { @config.owner_key => owner_id },
|
|
113
|
-
update_expression: 'SET version = if_not_exists(version, :zero) + :one,
|
|
321
|
+
update_expression: 'SET version = if_not_exists(version, :zero) + :one, ' \
|
|
322
|
+
'updated_at = :now, record_count = :count',
|
|
114
323
|
expression_attribute_values: { ':zero' => 0, ':one' => 1, ':now' => Time.now.iso8601, ':count' => record_count }
|
|
115
324
|
)
|
|
116
325
|
end
|
|
117
326
|
|
|
118
327
|
# Forwards a message and its keyword data to the configured logger.
# Does nothing when no logger is configured.
#
# @param level [Symbol] logger method to invoke (e.g. :info)
# @param message [String] log message
# @param data [Hash] keyword data passed through to the logger
# @return [Object, nil] the logger's return value, or nil without a logger
def log(level, message, **data)
  logger = @config.logger
  logger.send(level, message, **data) if logger
end
|
|
122
332
|
end
|
data/lib/s3arch/searcher.rb
CHANGED
|
@@ -72,9 +72,7 @@ module S3arch
|
|
|
72
72
|
|
|
73
73
|
def fetch_version(owner_id)
|
|
74
74
|
cached = self.class.version_cache[owner_id]
|
|
75
|
-
if cached && (Time.now - cached[:checked_at]) < @config.version_ttl
|
|
76
|
-
return cached[:version]
|
|
77
|
-
end
|
|
75
|
+
return cached[:version] if cached && (Time.now - cached[:checked_at]) < @config.version_ttl
|
|
78
76
|
|
|
79
77
|
result = @dynamodb.get_item(table_name: @config.version_table,
|
|
80
78
|
key: { @config.owner_key => owner_id },
|
|
@@ -112,6 +110,7 @@ module S3arch
|
|
|
112
110
|
rows = db.execute(sql, [match_expr])
|
|
113
111
|
rows.filter_map do |row|
|
|
114
112
|
next if filters.any? { |field, value| row[field.to_s] != value }
|
|
113
|
+
|
|
115
114
|
{ record_id: row['record_id'], rank: row['rank'] }
|
|
116
115
|
end
|
|
117
116
|
rescue SQLite3::Exception => e
|
|
@@ -131,6 +130,7 @@ module S3arch
|
|
|
131
130
|
|
|
132
131
|
# Sends a message plus keyword data to the configured logger, if any.
#
# @param level [Symbol] logger method to invoke (e.g. :info)
# @param message [String] log message
# @param data [Hash] keyword data passed through to the logger
# @return [Object, nil] the logger's return value, or nil without a logger
def log(level, message, **data)
  sink = @config.logger
  return nil unless sink

  sink.send(level, message, **data)
end
|
|
136
136
|
end
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module S3arch
  # Pure tokenization — turns the configured fields of a record into the
  # token strings that are stored alongside the record and later inserted
  # into the FTS5 index.
  class Tokenizer
    # @param fields [Array<String>] field names to tokenize; defaults to the
    #   globally configured searchable fields
    def initialize(fields: S3arch.configuration.searchable_fields)
      @fields = fields
    end

    # Builds the per-field token map for a record.
    #
    # @param record [Hash] record keyed by field name
    # @return [Hash{String => String}] field => normalized string
    def tokenize(record)
      @fields.to_h { |field| [field, normalize(record[field])] }
    end

    # Builds a single string with every non-empty field joined by spaces.
    #
    # @param record [Hash] record keyed by field name
    # @return [String] concatenated normalized field values
    def tokenize_flat(record)
      @fields.map { |field| normalize(record[field]) }.reject(&:empty?).join(' ')
    end

    private

    # Converts a plain or typed attribute value into a string.
    def normalize(value)
      case value
      when Hash then normalize_typed(value)
      when Array then join_elements(value)
      when nil then ''
      else value.to_s
      end
    end

    # Handles typed attribute hashes ('S', 'L', 'N', 'SS'); any other hash
    # contributes its first value.
    def normalize_typed(value)
      if value.key?('S') then value['S'].to_s
      elsif value.key?('L') then join_elements(value['L'])
      elsif value.key?('N') then value['N'].to_s
      elsif value.key?('SS') then value['SS'].join(' ')
      else value.values.first.to_s
      end
    end

    # Joins list elements with spaces. Only Hash elements are indexed with
    # 'S': on a String that lookup would return the substring "S" (or nil),
    # and on Integer or nil elements it would raise.
    def join_elements(elements)
      elements.map { |element| element.is_a?(Hash) ? (element['S'] || element.to_s) : element.to_s }.join(' ')
    end
  end
end
|
data/lib/s3arch/version.rb
CHANGED
data/lib/s3arch.rb
CHANGED
data.tar.gz.sig
ADDED
metadata
CHANGED
|
@@ -1,12 +1,39 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: s3arch
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.
|
|
4
|
+
version: 0.0.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Adam Dalton
|
|
8
8
|
bindir: bin
|
|
9
|
-
cert_chain:
|
|
9
|
+
cert_chain:
|
|
10
|
+
- |
|
|
11
|
+
-----BEGIN CERTIFICATE-----
|
|
12
|
+
MIIEdDCCAtygAwIBAgIBATANBgkqhkiG9w0BAQsFADBAMQ4wDAYDVQQDDAVhZ2Vu
|
|
13
|
+
dDEZMBcGCgmSJomT8ixkARkWCXN0b3d6aWxsYTETMBEGCgmSJomT8ixkARkWA2Nv
|
|
14
|
+
bTAeFw0yNjA2MDgxOTExNTlaFw0yNzA2MDgxOTExNTlaMEAxDjAMBgNVBAMMBWFn
|
|
15
|
+
ZW50MRkwFwYKCZImiZPyLGQBGRYJc3Rvd3ppbGxhMRMwEQYKCZImiZPyLGQBGRYD
|
|
16
|
+
Y29tMIIBojANBgkqhkiG9w0BAQEFAAOCAY8AMIIBigKCAYEAupBquKI/4WvXOgND
|
|
17
|
+
pXyqH2GllZs1wG4TWWdn/DoMg45UoCwD+AWEuGrIdInBCpPN8vEJNJWPoM/RrU+b
|
|
18
|
+
xRBZT4uUk00bnZRW2SYh5GJSqBoBR+rWc2DGkXyGfdRU2sQvkB0+is6ChgQ61WMM
|
|
19
|
+
33LE9+loBlVsZ6EVtrc18Uh2OW0mJpe0hN2nmBrxZqqOZigxC4DKRMFHvpRkxSb6
|
|
20
|
+
mD4kit1AcwX9NEWJsXxrPaetL/SB/VbXaEZX93XAvp6USaXvCWt4slkDS2mIvqtn
|
|
21
|
+
9DtGC43LFC7SDGbnsG9PVenQgVCi8UWFPUAab0PqZSlmi3Qlbhw8qTGPp5Cbv4vz
|
|
22
|
+
qjC2UGPOQigA/7lbbGRhCohMrjOVHMAQwkcgiIqtolUoYlnvPMIy+m3pdvgDv/PH
|
|
23
|
+
bsZGvXQ7i0458xsmp1vaKthZocVAR+GboHbuIiYPUnO45ccXUQ00x6365tTe7mZi
|
|
24
|
+
NvmUYdAGbQmVvFqyxF7IYA6sF74L2Lstu0knSfss557bAe1HAgMBAAGjeTB3MAkG
|
|
25
|
+
A1UdEwQCMAAwCwYDVR0PBAQDAgSwMB0GA1UdDgQWBBSnxTL/lNBCeLqpeVIX6AUY
|
|
26
|
+
kel4zjAeBgNVHREEFzAVgRNhZ2VudEBzdG93emlsbGEuY29tMB4GA1UdEgQXMBWB
|
|
27
|
+
E2FnZW50QHN0b3d6aWxsYS5jb20wDQYJKoZIhvcNAQELBQADggGBACm9Fjit/UCv
|
|
28
|
+
FxlKqeiCTIG94cIx+QrWAOJSx9knKydwUec1u04D/DbfZjTn3C2Bj227QgxeUn+6
|
|
29
|
+
if3e2v7zAk1896hLmGYzML0+nxQPb0vmtdLR7HETUlSKTVabcv1fbwLyjsuGrBvk
|
|
30
|
+
y51vOEzUEZ508a9yepLYqrQu1kOju4d57c9oA5l3H0mMKWz7av9tFj0B+STvuaWk
|
|
31
|
+
HRYDWc5HgOEVTyV+w0uFt2Kw4OCb8C42uSvC5RfYYtw78MSP+5Ru+LXJ7XOtmuN0
|
|
32
|
+
E6GVmofQ17ig9O3rgfFbMendSInrRmvPIGswvM1yivq9NOllFbdck2OJKPx6FCJF
|
|
33
|
+
7SJIkXQfc9P4B5iASIV1d1FsE0YX+g3jHXPJK/4mGL5bAyBKzpMfQB/mg6vQBzkh
|
|
34
|
+
aOKPwcreFj7TznBl89R5tNS9wZQfPVR98zgPyocddWhK18eQNMSBUnv4eeJ8PPbk
|
|
35
|
+
DovL+G8ajHDZ9fjH/+GVYHEMuiVdLarXrKJpHC1VfGTTUAp4NSEpUQ==
|
|
36
|
+
-----END CERTIFICATE-----
|
|
10
37
|
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
38
|
dependencies:
|
|
12
39
|
- !ruby/object:Gem::Dependency
|
|
@@ -62,11 +89,13 @@ files:
|
|
|
62
89
|
- CHANGELOG.md
|
|
63
90
|
- LICENSE.txt
|
|
64
91
|
- README.md
|
|
92
|
+
- certs/stowzilla.pem
|
|
65
93
|
- lib/s3arch.rb
|
|
66
94
|
- lib/s3arch/configuration.rb
|
|
67
95
|
- lib/s3arch/handler.rb
|
|
68
96
|
- lib/s3arch/indexer.rb
|
|
69
97
|
- lib/s3arch/searcher.rb
|
|
98
|
+
- lib/s3arch/tokenizer.rb
|
|
70
99
|
- lib/s3arch/version.rb
|
|
71
100
|
homepage: https://github.com/stowzilla/s3arch
|
|
72
101
|
licenses:
|
|
@@ -74,7 +103,7 @@ licenses:
|
|
|
74
103
|
metadata:
|
|
75
104
|
rubygems_mfa_required: 'true'
|
|
76
105
|
homepage_uri: https://github.com/stowzilla/s3arch
|
|
77
|
-
source_code_uri: https://github.com/stowzilla/s3arch
|
|
106
|
+
source_code_uri: https://github.com/stowzilla/s3arch/tree/main
|
|
78
107
|
changelog_uri: https://github.com/stowzilla/s3arch/blob/main/CHANGELOG.md
|
|
79
108
|
rdoc_options: []
|
|
80
109
|
require_paths:
|
|
@@ -90,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
90
119
|
- !ruby/object:Gem::Version
|
|
91
120
|
version: '0'
|
|
92
121
|
requirements: []
|
|
93
|
-
rubygems_version:
|
|
122
|
+
rubygems_version: 3.6.9
|
|
94
123
|
specification_version: 4
|
|
95
124
|
summary: SQLite FTS5 full-text search for DynamoDB on AWS Lambda
|
|
96
125
|
test_files: []
|
metadata.gz.sig
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
Oco�2��I�?D����0�̌��\�z,V��i���جԻX���f���f��$^�o������+"3�R�04q$�#`7���?���E�MF�A� �x���f�����]���~j�]�/�IK�Dz#ni��B[XS������m��%��jQ+�a��_Ghf�a�J�"(o�~|I��y�&��kXڈL �Ϟ�j�Y�<��C���e�:�Pø�A��2X�P��)����nxL�y�<�L��je�L7���|������_�pG��p�nfPw���P_+�$�N�e�8�!����2�ա�̕g0#����WEdp�8��0�v�#_\G�^lU���(2jǨ��@�o^)��x�R%3���x���|��=���Ǻ
|