s3arch 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +14 -0
- data/LICENSE.txt +21 -0
- data/README.md +152 -0
- data/lib/s3arch/configuration.rb +42 -0
- data/lib/s3arch/handler.rb +24 -0
- data/lib/s3arch/indexer.rb +123 -0
- data/lib/s3arch/searcher.rb +137 -0
- data/lib/s3arch/version.rb +5 -0
- data/lib/s3arch.rb +25 -0
- metadata +96 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 1d5bd8c8d47da22b8feb308ff8fbc96205d554ae915ef8e4e2831d5ba534f786
|
|
4
|
+
data.tar.gz: 4741157e9c522074a08c3478dc4cfddd16445590cfdb107100121083ff34a29a
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 54b5a57df4c767d23eac10d7984ea5ec796f60cfe9047b24619cb3d67600af2b67e71e2f0639704a55939a057f7351f7778a0587b9db6b80fa768eb2398c9e18
|
|
7
|
+
data.tar.gz: c4a2ff82ac4985b24376ccacab235e9a56132e1ee10b77202eff9defc62d0b1765ab9da31d89c695b42c3560984d1098df476a6b73b6a4561ad6c1cbefaa4749
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [0.0.1] - 2025-06-08
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
|
|
7
|
+
- Initial release
|
|
8
|
+
- `S3arch::Indexer` — builds per-owner SQLite FTS5 databases from DynamoDB records
|
|
9
|
+
- `S3arch::Searcher` — queries per-owner indexes from Lambda `/tmp` with LRU caching
|
|
10
|
+
- `S3arch::Handler` — pre-built Lambda handler methods for indexer and search
|
|
11
|
+
- `S3arch::Configuration` — declarative configuration with `from_env!` convenience
|
|
12
|
+
- DynamoDB version tracking for cache invalidation
|
|
13
|
+
- EventBridge Pipe architecture (DynamoDB Streams → SQS → Indexer Lambda)
|
|
14
|
+
- Terraform module for infrastructure provisioning
|
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Stowzilla
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
# S3arch
|
|
2
|
+
|
|
3
|
+
Full-text search for DynamoDB on AWS Lambda using SQLite FTS5. Per-owner indexes stored on S3, queried from Lambda `/tmp` with LRU eviction and DynamoDB version tracking.
|
|
4
|
+
|
|
5
|
+
## Why?
|
|
6
|
+
|
|
7
|
+
DynamoDB doesn't have native full-text search. OpenSearch/Elasticsearch is expensive and complex for many use cases. S3arch gives you fast full-text search with prefix matching, using little infrastructure beyond what you already have (Lambda, S3, DynamoDB).
|
|
8
|
+
|
|
9
|
+
**How it works:**
|
|
10
|
+
|
|
11
|
+
```
|
|
12
|
+
DynamoDB (source) → Stream → EventBridge Pipe → SQS → Indexer Lambda
|
|
13
|
+
↓
|
|
14
|
+
SQLite FTS5 DB
|
|
15
|
+
↓
|
|
16
|
+
S3 (per-owner)
|
|
17
|
+
↓
|
|
18
|
+
Search Lambda ← /tmp cache (LRU)
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Each owner (user, tenant, account) gets their own SQLite database. The indexer rebuilds it on every change. The searcher downloads it to `/tmp` on first query, then serves subsequent queries from the warm cache until the version changes.
|
|
22
|
+
|
|
23
|
+
## Installation
|
|
24
|
+
|
|
25
|
+
```ruby
|
|
26
|
+
gem 's3arch'
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Requires the `sqlite3` native extension available at runtime. On Lambda, use a layer that provides it (e.g., [stowzilla-sqlite3-ruby](https://github.com/stowzilla/stowzilla-sqlite3-ruby)).
|
|
30
|
+
|
|
31
|
+
## Usage
|
|
32
|
+
|
|
33
|
+
### Configuration
|
|
34
|
+
|
|
35
|
+
```ruby
|
|
36
|
+
require 's3arch'
|
|
37
|
+
|
|
38
|
+
S3arch.configure do |c|
|
|
39
|
+
c.from_env! # Reads S3ARCH_* env vars
|
|
40
|
+
c.source_index = 'UserIndex' # GSI for owner lookup
|
|
41
|
+
c.owner_key = 'userId' # Partition key for owner
|
|
42
|
+
c.searchable_fields = %w[name description tags] # FTS5 indexed fields
|
|
43
|
+
c.metadata_fields = %w[status created_at] # Stored for filtering (not searched)
|
|
44
|
+
c.record_filter = ->(item) { item['status'] == 'active' }
|
|
45
|
+
c.logger = Logger.new($stdout)
|
|
46
|
+
end
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Lambda Handlers
|
|
50
|
+
|
|
51
|
+
**Indexer** (triggered by SQS from DynamoDB Streams):
|
|
52
|
+
|
|
53
|
+
```ruby
|
|
54
|
+
def handler(event:, context:)
|
|
55
|
+
S3arch::Handler.indexer(event)
|
|
56
|
+
end
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
**Searcher** (invoked directly or via API Gateway):
|
|
60
|
+
|
|
61
|
+
```ruby
|
|
62
|
+
def handler(event:, context:)
|
|
63
|
+
result = S3arch::Handler.search(event)
|
|
64
|
+
# => { record_ids: ["id1", "id2", ...], search_mode: "fts5" }
|
|
65
|
+
end
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Search Event Format
|
|
69
|
+
|
|
70
|
+
```json
|
|
71
|
+
{
|
|
72
|
+
"query": "blue chair",
|
|
73
|
+
"owner_ids": ["user-123", "user-456"],
|
|
74
|
+
"filters": { "status": "active" }
|
|
75
|
+
}
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Manual Indexing
|
|
79
|
+
|
|
80
|
+
```ruby
|
|
81
|
+
indexer = S3arch::Indexer.new
|
|
82
|
+
indexer.rebuild("user-123")
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Environment Variables
|
|
86
|
+
|
|
87
|
+
| Variable | Used By | Description |
|
|
88
|
+
|----------|---------|-------------|
|
|
89
|
+
| `S3ARCH_SOURCE_TABLE` | Indexer | DynamoDB source table name |
|
|
90
|
+
| `S3ARCH_SOURCE_INDEX` | Indexer | GSI name for owner lookup (default: `UserIndex`) |
|
|
91
|
+
| `S3ARCH_INDEX_BUCKET` | Both | S3 bucket for SQLite index files |
|
|
92
|
+
| `S3ARCH_VERSION_TABLE` | Both | DynamoDB version tracking table |
|
|
93
|
+
|
|
94
|
+
## Infrastructure
|
|
95
|
+
|
|
96
|
+
A Terraform module is provided at [`terraform/`](./terraform/) that provisions:
|
|
97
|
+
|
|
98
|
+
- S3 bucket for index storage (with 90-day lifecycle)
|
|
99
|
+
- DynamoDB version tracking table (PAY_PER_REQUEST)
|
|
100
|
+
- SQS queue + DLQ for the indexer
|
|
101
|
+
- EventBridge Pipe (DynamoDB Stream → SQS)
|
|
102
|
+
|
|
103
|
+
```hcl
|
|
104
|
+
module "s3arch" {
|
|
105
|
+
source = "github.com/stowzilla/s3arch//terraform"
|
|
106
|
+
app_name = "myapp"
|
|
107
|
+
environment = "production"
|
|
108
|
+
source_table_name = aws_dynamodb_table.items.name
|
|
109
|
+
source_table_arn = aws_dynamodb_table.items.arn
|
|
110
|
+
source_table_stream_arn = aws_dynamodb_table.items.stream_arn
|
|
111
|
+
}
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Outputs include `indexer_env_vars`, `searcher_env_vars`, `indexer_permissions`, and `searcher_permissions` for easy Lambda configuration.
|
|
115
|
+
|
|
116
|
+
## Configuration Options
|
|
117
|
+
|
|
118
|
+
| Option | Default | Description |
|
|
119
|
+
|--------|---------|-------------|
|
|
120
|
+
| `source_table` | — | DynamoDB table with source records |
|
|
121
|
+
| `source_index` | — (`UserIndex` when set via `from_env!`) | GSI name for querying by owner |
|
|
122
|
+
| `owner_key` | `"user_id"` | Partition key field for owner lookup |
|
|
123
|
+
| `index_bucket` | — | S3 bucket for SQLite files |
|
|
124
|
+
| `version_table` | — | DynamoDB table for version tracking |
|
|
125
|
+
| `searchable_fields` | `["name", "description"]` | Fields indexed in FTS5 |
|
|
126
|
+
| `metadata_fields` | `["status", "created_at"]` | Fields stored for filtering |
|
|
127
|
+
| `record_filter` | `->(_) { true }` | Proc to filter records during indexing |
|
|
128
|
+
| `owner_extractor` | (extracts from DynamoDB stream image) | Proc to extract owner_id from stream event |
|
|
129
|
+
| `version_ttl` | `30` (seconds) | How long to cache version checks |
|
|
130
|
+
| `max_results` | `50` | Maximum search results returned |
|
|
131
|
+
| `max_cached_dbs` | `20` | Max databases cached in `/tmp` |
|
|
132
|
+
|
|
133
|
+
## How Search Works
|
|
134
|
+
|
|
135
|
+
1. **Query arrives** with `owner_ids` and a search string
|
|
136
|
+
2. For each owner, check DynamoDB version table (cached for `version_ttl` seconds)
|
|
137
|
+
3. If version changed (or first request), download SQLite DB from S3 to `/tmp`
|
|
138
|
+
4. Run FTS5 `MATCH` query with prefix matching (`term*`)
|
|
139
|
+
5. Apply metadata filters, sort by rank, return record IDs
|
|
140
|
+
6. Evict the least recently used database once `max_cached_dbs` databases are cached in `/tmp`
|
|
141
|
+
|
|
142
|
+
## Requirements
|
|
143
|
+
|
|
144
|
+
- Ruby >= 3.2
|
|
145
|
+
- AWS Lambda with `/tmp` storage (recommend 2048MB ephemeral)
|
|
146
|
+
- SQLite3 native extension (via Lambda layer)
|
|
147
|
+
- DynamoDB table with streams enabled
|
|
148
|
+
- S3 bucket for index storage
|
|
149
|
+
|
|
150
|
+
## License
|
|
151
|
+
|
|
152
|
+
MIT
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module S3arch
|
|
4
|
+
# Settings object for S3arch: where the source records live, where the index
# files and version records are stored, which fields are indexed, and the
# tuning values used by the searcher cache.
class Configuration
  # Settings that must be present before an Indexer or Searcher can run,
  # listed in the order they are reported by #validate!.
  REQUIRED_SETTINGS = %i[source_table index_bucket version_table source_index].freeze

  attr_accessor :source_table, :source_index, :owner_key, :index_bucket, :version_table,
                :searchable_fields, :metadata_fields, :record_filter, :owner_extractor,
                :logger, :version_ttl, :max_results, :max_cached_dbs, :ephemeral_storage_mb

  # Populates the defaults. Table, bucket and index names start out unset.
  def initialize
    @owner_key = 'user_id'
    @searchable_fields = %w[name description]
    @metadata_fields = %w[status created_at]
    @record_filter = ->(_record) { true }
    # Reads the owner id from the stream record's NewImage, falling back to
    # OldImage. Only a string-typed ('S') owner attribute is extracted.
    # `owner_key` is looked up at call time, so later changes are honoured.
    @owner_extractor = lambda do |stream_record|
      image = stream_record.dig('dynamodb', 'NewImage') || stream_record.dig('dynamodb', 'OldImage') || {}
      image.dig(owner_key, 'S')
    end
    @version_ttl = 30
    @max_results = 50
    @max_cached_dbs = 20
    @ephemeral_storage_mb = 2048
    @logger = nil
  end

  # Loads table/bucket/index names from the S3ARCH_* environment variables.
  # The source index falls back to 'UserIndex' when its variable is unset.
  #
  # @return [Configuration] self, so the call can be chained
  def from_env!
    @source_table = ENV['S3ARCH_SOURCE_TABLE']
    @source_index = ENV['S3ARCH_SOURCE_INDEX'] || 'UserIndex'
    @index_bucket = ENV['S3ARCH_INDEX_BUCKET']
    @version_table = ENV['S3ARCH_VERSION_TABLE']
    self
  end

  # Ensures every required setting has a usable value. Blank strings (for
  # example an environment variable set to "") count as missing, because they
  # can never name a real table, bucket or index.
  #
  # @raise [Error] listing every missing setting
  # @return [nil] when the configuration is complete
  def validate!
    missing = REQUIRED_SETTINGS.select { |setting| blank?(public_send(setting)) }
    raise Error, "S3arch configuration missing: #{missing.join(', ')}" if missing.any?
  end

  private

  # True for nil and for values whose string form is empty or whitespace.
  def blank?(value)
    value.nil? || value.to_s.strip.empty?
  end
end
|
|
42
|
+
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module S3arch
|
|
4
|
+
# Ready-made Lambda entry points that delegate to the Indexer and Searcher.
module Handler
  # Rebuilds indexes for the owners referenced by the incoming event.
  #
  # @param event [Hash] event passed through to Indexer#process_event
  # @return whatever Indexer#process_event returns
  def self.indexer(event)
    S3arch::Indexer.new.process_event(event)
  end

  # Runs a search described by the 'query', 'owner_ids' and 'filters' keys.
  #
  # @param event [Hash] string-keyed search event
  # @return [Hash] :record_ids and :search_mode. An event without a query or
  #   owners yields an empty id list; a search that produced no result yields
  #   nil for both keys.
  #   NOTE(review): the two "nothing found" shapes differ ([] vs nil) -
  #   confirm callers handle both.
  def self.search(event)
    text = event['query']
    owners = event['owner_ids'] || []
    criteria = event['filters'] || {}

    if text.nil? || owners.empty?
      { record_ids: [], search_mode: 'fts5' }
    else
      outcome = S3arch::Searcher.new.search(query: text, owner_ids: owners, filters: criteria)
      outcome || { record_ids: nil, search_mode: nil }
    end
  end
end
|
|
24
|
+
end
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'aws-sdk-dynamodb'
|
|
4
|
+
require 'aws-sdk-s3'
|
|
5
|
+
require 'sqlite3'
|
|
6
|
+
require 'json'
|
|
7
|
+
|
|
8
|
+
module S3arch
|
|
9
|
+
class Indexer
|
|
10
|
+
# Validates the configuration before anything else, then keeps it and
# creates the DynamoDB and S3 clients with their default settings.
#
# @param config [Configuration] defaults to the global S3arch configuration
# @raise whatever config.validate! raises for an incomplete configuration
def initialize(config: S3arch.configuration)
  @config = config.tap(&:validate!)
  @dynamodb = Aws::DynamoDB::Client.new
  @s3 = Aws::S3::Client.new
end
|
|
16
|
+
|
|
17
|
+
# Rebuilds one owner's index from scratch: fetch the records, build the
# SQLite file under /tmp, upload it, then bump the owner's version record.
# The temporary file is removed afterwards, whether or not a step failed.
#
# @param owner_id [String] owner whose index is rebuilt
def rebuild(owner_id)
  sqlite_path = nil
  items = fetch_records(owner_id)
  sqlite_path = "/tmp/s3arch_#{owner_id}.sqlite3"
  total = items.size

  build_database(sqlite_path, items)
  upload(owner_id, sqlite_path)
  increment_version(owner_id, total)

  log(:info, 'Index rebuilt', owner_id: owner_id, record_count: total)
ensure
  # sqlite_path stays nil when fetching the records failed.
  File.unlink(sqlite_path) if sqlite_path && File.exist?(sqlite_path)
end
|
|
29
|
+
|
|
30
|
+
# Handles a batch event: parses each record's JSON 'body', asks the
# configured owner_extractor for an owner id, and rebuilds the index once
# per distinct owner.
#
# @param event [Hash] event whose 'Records' entries carry a JSON 'body'
# @return [Hash] :statusCode 200 and a JSON :body with the rebuilt count
def process_event(event)
  messages = event['Records'] || []

  extracted = messages.map do |message|
    @config.owner_extractor.call(JSON.parse(message['body']))
  end
  # Messages for which no owner could be determined are dropped.
  owners = extracted.select(&:itself).uniq

  log(:info, 'Rebuilding indexes', owner_ids: owners, record_count: messages.size)
  owners.each { |owner| rebuild(owner) }

  summary = { rebuilt: owners.size }
  { statusCode: 200, body: JSON.generate(summary) }
end
|
|
41
|
+
|
|
42
|
+
private
|
|
43
|
+
|
|
44
|
+
# Queries the source table's index for every record belonging to an owner,
# following pagination until the last page, and keeps only the items the
# configured record_filter accepts.
#
# @param owner_id [String] value matched against the configured owner_key
# @return [Array] accepted items, in the order the pages returned them
def fetch_records(owner_id)
  params = {
    table_name: @config.source_table,
    index_name: @config.source_index,
    # The attribute name goes through a placeholder so that an owner_key
    # that is a DynamoDB reserved word (or contains special characters)
    # still forms a valid key condition.
    key_condition_expression: '#owner = :owner',
    expression_attribute_names: { '#owner' => @config.owner_key },
    expression_attribute_values: { ':owner' => owner_id }
  }

  records = []
  loop do
    page = @dynamodb.query(params)
    records.concat(page.items.select { |item| @config.record_filter.call(item) })
    break unless page.last_evaluated_key

    # Continue from where the previous page stopped.
    params = params.merge(exclusive_start_key: page.last_evaluated_key)
  end

  records
end
|
|
59
|
+
|
|
60
|
+
# Creates a fresh SQLite file at db_path with two tables sharing a rowid:
# `records_fts` (FTS5, one column per searchable field) and `records_meta`
# (record id plus one TEXT column per metadata field), then inserts every
# record inside a single transaction.
#
# The database handle is closed even when a statement fails, and the insert
# statements stay valid when `metadata_fields` is empty.
#
# @param db_path [String] location of the SQLite file; replaced if present
# @param records [Array<Hash>] source records; the record id is read from 'id'
def build_database(db_path, records)
  File.delete(db_path) if File.exist?(db_path)
  db = SQLite3::Database.new(db_path)

  fts_fields = @config.searchable_fields
  meta_fields = @config.metadata_fields
  meta_defs = ['rowid INTEGER PRIMARY KEY', 'record_id TEXT NOT NULL'] + meta_fields.map { |f| "#{f} TEXT" }

  db.execute_batch(<<~SQL)
    CREATE VIRTUAL TABLE records_fts USING fts5(#{fts_fields.join(', ')}, content='');
    CREATE TABLE records_meta (#{meta_defs.join(', ')});
  SQL

  # The statements are identical for every row, so build them once. Deriving
  # the column list and the placeholders from the same array avoids a
  # dangling comma when there are no metadata fields.
  insert_sql = lambda do |table, columns|
    "INSERT INTO #{table}(#{columns.join(', ')}) VALUES (#{(['?'] * columns.size).join(', ')})"
  end
  fts_insert = insert_sql.call('records_fts', ['rowid', *fts_fields])
  meta_insert = insert_sql.call('records_meta', ['rowid', 'record_id', *meta_fields])

  db.transaction do
    # rowids start at 1 and tie each FTS row to its metadata row.
    records.each.with_index(1) do |record, rowid|
      db.execute(fts_insert, [rowid, *fts_fields.map { |f| normalize_field(record[f]) }])
      db.execute(meta_insert,
                 [rowid, normalize_field(record['id']), *meta_fields.map { |f| normalize_field(record[f]) }])
    end
  end
ensure
  db&.close
end
|
|
89
|
+
|
|
90
|
+
# Flattens an attribute value into the plain string stored in the index.
#
# Accepts both DynamoDB-typed hashes ({'S' => ...}, {'N' => ...}, {'L' => ...},
# {'SS' => ...}, {'NS' => ...}) and already-unmarshalled Ruby values. Lists,
# arrays and sets are flattened recursively and joined with single spaces,
# so nested typed elements contribute their text rather than an inspected
# hash. `Set` is handled because string sets may arrive as Ruby sets once
# unmarshalled (Set needs no require on the Ruby versions this gem supports).
#
# @param value [Object, nil] raw attribute value
# @return [String] text representation; '' for nil
def normalize_field(value)
  case value
  when Hash
    if value.key?('S') then value['S'].to_s
    elsif value.key?('L') then value['L'].map { |element| normalize_field(element) }.join(' ')
    elsif value.key?('N') then value['N'].to_s
    elsif value.key?('SS') then value['SS'].join(' ')
    elsif value.key?('NS') then value['NS'].join(' ')
    else value.values.first.to_s
    end
  when Array, Set then value.map { |element| normalize_field(element) }.join(' ')
  when nil then ''
  else value.to_s
  end
end
|
|
104
|
+
|
|
105
|
+
# Uploads the SQLite file to the index bucket under "<owner_id>/index.sqlite3".
#
# The file is opened in block form so its handle is closed as soon as the
# upload returns or raises, instead of lingering until garbage collection.
#
# @param owner_id [String] owner whose index is being stored
# @param db_path [String] local path of the SQLite file
# @return the S3 client's put_object response
def upload(owner_id, db_path)
  File.open(db_path, 'rb') do |file|
    @s3.put_object(bucket: @config.index_bucket, key: "#{owner_id}/index.sqlite3", body: file)
  end
end
|
|
108
|
+
|
|
109
|
+
# Bumps the owner's entry in the version table: increments `version`
# (starting from zero when the attribute does not exist yet) and records
# the update time and the number of indexed records.
#
# @param owner_id [String] key value of the version item
# @param record_count [Integer] number of records in the rebuilt index
# @return the DynamoDB client's update_item response
def increment_version(owner_id, record_count)
  expression = 'SET version = if_not_exists(version, :zero) + :one, updated_at = :now, record_count = :count'
  item_key = { @config.owner_key => owner_id }
  bindings = {
    ':zero' => 0,
    ':one' => 1,
    ':now' => Time.now.iso8601,
    ':count' => record_count
  }

  @dynamodb.update_item(
    table_name: @config.version_table,
    key: item_key,
    update_expression: expression,
    expression_attribute_values: bindings
  )
end
|
|
117
|
+
|
|
118
|
+
# Writes a message to the configured logger, if there is one.
#
# Structured data is handed to the logger as keyword arguments when the
# logger's method can receive an extra argument. Loggers whose level methods
# take only a message (such as Ruby's standard Logger, whose `info` accepts
# at most one argument) would raise ArgumentError, so for them the data is
# appended to the message as `key=value` pairs instead.
#
# @param level [Symbol] logger method to call, e.g. :info or :error
# @param message [String] log message
# @param data [Hash] optional structured context
def log(level, message, **data)
  logger = @config.logger
  return unless logger
  return logger.public_send(level, message) if data.empty?

  params = logger.method(level).parameters
  accepts_data = params.any? { |type, _| %i[key keyreq keyrest rest].include?(type) } ||
                 params.count { |type, _| %i[req opt].include?(type) } > 1

  if accepts_data
    logger.public_send(level, message, **data)
  else
    details = data.map { |key, value| "#{key}=#{value.inspect}" }.join(' ')
    logger.public_send(level, "#{message} #{details}")
  end
end
|
|
122
|
+
end
|
|
123
|
+
end
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'aws-sdk-dynamodb'
|
|
4
|
+
require 'aws-sdk-s3'
|
|
5
|
+
require 'sqlite3'
|
|
6
|
+
|
|
7
|
+
module S3arch
|
|
8
|
+
class Searcher
|
|
9
|
+
# Class-level caches shared by every Searcher instance:
# version_cache is keyed by owner id and holds the last version that was
# looked up; db_cache is keyed by owner id and holds the open database.
class << self
  attr_accessor :version_cache, :db_cache

  # Replaces both caches with fresh, empty hashes.
  def reset!
    self.version_cache = {}
    self.db_cache = {}
  end
end

# Start with empty caches.
reset!
|
|
20
|
+
|
|
21
|
+
# Checks the configuration first, then stores it and builds the DynamoDB
# and S3 clients with their default settings.
#
# @param config [Configuration] defaults to the global S3arch configuration
# @raise whatever config.validate! raises for an incomplete configuration
def initialize(config: S3arch.configuration)
  @config = config.tap(&:validate!)
  @dynamodb = Aws::DynamoDB::Client.new
  @s3 = Aws::S3::Client.new
end
|
|
27
|
+
|
|
28
|
+
# Searches each owner's index and merges the hits.
#
# @param query [String, nil] search text
# @param owner_ids [Array] owners whose indexes are searched
# @param filters [Hash] metadata field => required value
# @return [Hash, nil] :record_ids (ordered by rank, capped at max_results)
#   and :search_mode; nil when the input is blank or nothing matched
def search(query:, owner_ids:, filters: {})
  return nil if owner_ids.empty? || query.nil? || query.strip.empty?

  log(:info, 'Search started', query: query, owner_ids: owner_ids, filters: filters)

  matches = owner_ids.flat_map do |owner_id|
    database = ensure_database(owner_id)
    if database
      found = query_fts(database, query, filters: filters)
      log(:info, 'Owner search complete', owner_id: owner_id, hits: found.size)
      found
    else
      # Owners without a usable index simply contribute no hits.
      log(:info, 'No database available', owner_id: owner_id)
      []
    end
  end

  log(:info, 'Search complete', total_results: matches.size)
  return nil if matches.empty?

  ranked = matches.sort_by { |match| match[:rank] }
  top_ids = ranked.take(@config.max_results).map { |match| match[:record_id] }
  { record_ids: top_ids, search_mode: 'fts5' }
end
|
|
51
|
+
|
|
52
|
+
private
|
|
53
|
+
|
|
54
|
+
# Returns an open database for the owner, reusing the cached handle while
# its version matches the current one and downloading a fresh copy otherwise.
#
# A cached handle for an outdated version is removed from the cache and
# closed before the new file is downloaded. Otherwise the download would
# overwrite the file underneath a still-open connection and the old handle
# would never be closed.
#
# @param owner_id [String] owner whose index is needed
# @return [Object, nil] database handle, or nil when the owner has no
#   version record or the index cannot be downloaded
def ensure_database(owner_id)
  current_version = fetch_version(owner_id)
  return nil unless current_version

  cache = self.class.db_cache
  cached = cache[owner_id]
  if cached
    if cached[:version] == current_version
      cached[:last_used] = Time.now
      return cached[:db]
    end

    cache.delete(owner_id)
    begin
      cached[:db]&.close
    rescue StandardError => e
      # Failing to close a stale handle must not block serving the new index.
      log(:warn, 'Failed to close stale database', owner_id: owner_id, error: e.message)
    end
  end

  log(:info, 'Downloading database from S3', owner_id: owner_id, version: current_version)
  db = download_database(owner_id)
  return nil unless db

  evict_lru if cache.size >= @config.max_cached_dbs
  cache[owner_id] = { db: db, version: current_version, last_used: Time.now }
  db
end
|
|
72
|
+
|
|
73
|
+
# Looks up the owner's current index version, answering from the shared
# version cache while the cached entry is younger than version_ttl seconds.
#
# @param owner_id [String] key value of the version item
# @return [Object, nil] the stored version, or nil when no version exists
def fetch_version(owner_id)
  cache = self.class.version_cache
  entry = cache[owner_id]
  still_fresh = entry && (Time.now - entry[:checked_at]) < @config.version_ttl
  return entry[:version] if still_fresh

  response = @dynamodb.get_item(
    table_name: @config.version_table,
    key: { @config.owner_key => owner_id },
    projection_expression: 'version'
  )
  item = response.item
  version = item.dig('version') unless item.nil?

  # Only real versions are remembered; a missing version is looked up again next time.
  cache[owner_id] = { version: version, checked_at: Time.now } if version
  version
end
|
|
86
|
+
|
|
87
|
+
# Downloads the owner's index file from the index bucket into /tmp and
# opens it with rows returned as hashes.
#
# @param owner_id [String] owner whose index is fetched
# @return [SQLite3::Database, nil] open handle, or nil when the bucket has
#   no index object for this owner
def download_database(owner_id)
  local_path = "/tmp/s3arch_#{owner_id}.sqlite3"
  @s3.get_object(
    bucket: @config.index_bucket,
    key: "#{owner_id}/index.sqlite3",
    response_target: local_path
  )

  SQLite3::Database.new(local_path).tap { |handle| handle.results_as_hash = true }
rescue Aws::S3::Errors::NoSuchKey
  nil
end
|
|
97
|
+
|
|
98
|
+
# Runs the full-text query against one owner's database.
#
# Every whitespace-separated term is turned into a quoted FTS5 prefix token
# ("term"*) and the tokens are AND-ed together. Quoting keeps characters
# such as `-`, `:` or `(` and words such as NOT/OR from being parsed as
# FTS5 query syntax, which previously made the statement fail and the
# search silently return nothing.
#
# Metadata filters are applied in Ruby after the query, so the SQL LIMIT is
# only used when there are no filters; otherwise rows removed by a filter
# would have consumed slots that later matching rows should get.
#
# @param db [Object] open database returning rows as hashes
# @param query [String] raw search text
# @param filters [Hash] metadata field => required value (exact match)
# @return [Array<Hash>] up to max_results hashes with :record_id and :rank;
#   [] when no usable term remains or the query fails
def query_fts(db, query, filters: {})
  terms = query.strip.split(/\s+/).map { |term| term.delete('"') }.reject(&:empty?)
  return [] if terms.empty?

  match_expr = terms.map { |term| %("#{term}"*) }.join(' AND ')
  limit = @config.max_results.to_i
  limit_clause = filters.empty? ? "LIMIT #{limit}" : ''

  sql = <<~SQL
    SELECT m.record_id, m.*, rank
    FROM records_fts f
    JOIN records_meta m ON m.rowid = f.rowid
    WHERE records_fts MATCH ?
    ORDER BY rank
    #{limit_clause}
  SQL

  rows = db.execute(sql, [match_expr])
  hits = rows.filter_map do |row|
    next if filters.any? { |field, value| row[field.to_s] != value }

    { record_id: row['record_id'], rank: row['rank'] }
  end
  hits.first(limit)
rescue SQLite3::Exception => e
  log(:error, 'FTS5 query failed', error: e.message)
  []
end
|
|
121
|
+
|
|
122
|
+
# Drops the cache entry with the oldest :last_used time: closes its
# database, deletes its file under /tmp, and removes it from the cache.
# Closing and deleting are best-effort; failures there are ignored.
#
# @return [Hash, nil] the removed cache entry, or nil when the cache is empty
def evict_lru
  cache = self.class.db_cache
  return if cache.empty?

  victim_id, victim = cache.min_by { |_owner, details| details[:last_used] }

  begin
    victim[:db]&.close
  rescue StandardError
    nil
  end

  begin
    File.delete("/tmp/s3arch_#{victim_id}.sqlite3")
  rescue StandardError
    nil
  end

  cache.delete(victim_id)
end
|
|
131
|
+
|
|
132
|
+
# Writes a message to the configured logger, if there is one.
#
# Structured data is handed to the logger as keyword arguments when the
# logger's method can receive an extra argument. Loggers whose level methods
# take only a message (such as Ruby's standard Logger, whose `info` accepts
# at most one argument) would raise ArgumentError, so for them the data is
# appended to the message as `key=value` pairs instead.
#
# @param level [Symbol] logger method to call, e.g. :info or :error
# @param message [String] log message
# @param data [Hash] optional structured context
def log(level, message, **data)
  logger = @config.logger
  return unless logger
  return logger.public_send(level, message) if data.empty?

  params = logger.method(level).parameters
  accepts_data = params.any? { |type, _| %i[key keyreq keyrest rest].include?(type) } ||
                 params.count { |type, _| %i[req opt].include?(type) } > 1

  if accepts_data
    logger.public_send(level, message, **data)
  else
    details = data.map { |key, value| "#{key}=#{value.inspect}" }.join(' ')
    logger.public_send(level, "#{message} #{details}")
  end
end
|
|
136
|
+
end
|
|
137
|
+
end
|
data/lib/s3arch.rb
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 's3arch/version'
|
|
4
|
+
require_relative 's3arch/configuration'
|
|
5
|
+
require_relative 's3arch/indexer'
|
|
6
|
+
require_relative 's3arch/searcher'
|
|
7
|
+
require_relative 's3arch/handler'
|
|
8
|
+
|
|
9
|
+
# Top-level namespace: owns the library's error class and the single shared
# Configuration instance.
module S3arch
  # Raised for library-specific failures such as an incomplete configuration.
  class Error < StandardError; end

  # The shared configuration, created on first access.
  #
  # @return [Configuration]
  def self.configuration
    @configuration ||= Configuration.new
  end

  # Yields the shared configuration so callers can set options in a block.
  def self.configure
    yield(configuration)
  end

  # Discards the shared configuration and replaces it with a fresh one.
  #
  # @return [Configuration] the new configuration
  def self.reset_configuration!
    @configuration = Configuration.new
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: s3arch
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.0.1
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Adam Dalton
|
|
8
|
+
bindir: bin
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: aws-sdk-dynamodb
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - "~>"
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '1.0'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - "~>"
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '1.0'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: aws-sdk-s3
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - "~>"
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '1.0'
|
|
33
|
+
type: :runtime
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - "~>"
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '1.0'
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: sqlite3
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - "~>"
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '2.0'
|
|
47
|
+
type: :runtime
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - "~>"
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '2.0'
|
|
54
|
+
description: Per-owner SQLite FTS5 indexes stored on S3, queried from Lambda /tmp.
|
|
55
|
+
DynamoDB source records are indexed via streams, with version tracking and LRU caching.
|
|
56
|
+
email:
|
|
57
|
+
- adam@stowzilla.com
|
|
58
|
+
executables: []
|
|
59
|
+
extensions: []
|
|
60
|
+
extra_rdoc_files: []
|
|
61
|
+
files:
|
|
62
|
+
- CHANGELOG.md
|
|
63
|
+
- LICENSE.txt
|
|
64
|
+
- README.md
|
|
65
|
+
- lib/s3arch.rb
|
|
66
|
+
- lib/s3arch/configuration.rb
|
|
67
|
+
- lib/s3arch/handler.rb
|
|
68
|
+
- lib/s3arch/indexer.rb
|
|
69
|
+
- lib/s3arch/searcher.rb
|
|
70
|
+
- lib/s3arch/version.rb
|
|
71
|
+
homepage: https://github.com/stowzilla/s3arch
|
|
72
|
+
licenses:
|
|
73
|
+
- MIT
|
|
74
|
+
metadata:
|
|
75
|
+
rubygems_mfa_required: 'true'
|
|
76
|
+
homepage_uri: https://github.com/stowzilla/s3arch
|
|
77
|
+
source_code_uri: https://github.com/stowzilla/s3arch
|
|
78
|
+
changelog_uri: https://github.com/stowzilla/s3arch/blob/main/CHANGELOG.md
|
|
79
|
+
rdoc_options: []
|
|
80
|
+
require_paths:
|
|
81
|
+
- lib
|
|
82
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
83
|
+
requirements:
|
|
84
|
+
- - ">="
|
|
85
|
+
- !ruby/object:Gem::Version
|
|
86
|
+
version: '3.2'
|
|
87
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
88
|
+
requirements:
|
|
89
|
+
- - ">="
|
|
90
|
+
- !ruby/object:Gem::Version
|
|
91
|
+
version: '0'
|
|
92
|
+
requirements: []
|
|
93
|
+
rubygems_version: 4.0.11
|
|
94
|
+
specification_version: 4
|
|
95
|
+
summary: SQLite FTS5 full-text search for DynamoDB on AWS Lambda
|
|
96
|
+
test_files: []
|