pii_cipher 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +15 -0
- data/CODE_OF_CONDUCT.md +10 -0
- data/Cargo.lock +394 -0
- data/Cargo.toml +13 -0
- data/LICENSE.txt +21 -0
- data/README.md +302 -0
- data/Rakefile +34 -0
- data/benchmarks/run.rb +203 -0
- data/ext/pii_cipher/Cargo.toml +22 -0
- data/ext/pii_cipher/build.rs +5 -0
- data/ext/pii_cipher/extconf.rb +6 -0
- data/ext/pii_cipher/src/lib.rs +131 -0
- data/lib/pii_cipher/active_record_ext.rb +92 -0
- data/lib/pii_cipher/query_interceptor.rb +73 -0
- data/lib/pii_cipher/railtie.rb +15 -0
- data/lib/pii_cipher/version.rb +5 -0
- data/lib/pii_cipher.rb +51 -0
- data/mise.toml +2 -0
- data/sig/pii_cipher.rbs +16 -0
- metadata +115 -0
data/README.md
ADDED
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
# PiiCipher
|
|
2
|
+
|
|
3
|
+
A Rails gem that enables **searchable blind indexing** for PII fields — powered by a Rust extension for performance.
|
|
4
|
+
|
|
5
|
+
PiiCipher handles the **search layer** of encrypted PII. It is designed to sit alongside Rails' built-in `ActiveRecord::Encryption` (`encrypts :email`), which handles the actual column encryption. Together they give you full GDPR-compliant storage: the real value never touches the database as plaintext, and searching still works.
|
|
6
|
+
|
|
7
|
+
PiiCipher computes HMAC-SHA256 hashes of the plaintext value before it is encrypted, and stores those hashes in a separate column. Queries are rewritten to search the hashes — the ciphertext column is never scanned.
|
|
8
|
+
|
|
9
|
+
Two search modes are supported:
|
|
10
|
+
|
|
11
|
+
| Mode | Column type | Use case |
|
|
12
|
+
|------|-------------|----------|
|
|
13
|
+
| **Partial** (default) | `jsonb` array | `LIKE`-style substring searches (e.g. searching `"smi"` matches `"Smith"`) |
|
|
14
|
+
| **Exact** | `string` | Exact-match lookups (e.g. looking up a full SSN or email) |
|
|
15
|
+
|
|
16
|
+
## How it works
|
|
17
|
+
|
|
18
|
+
### Partial search — trigram blind indexing
|
|
19
|
+
|
|
20
|
+
For partial search, PiiCipher slides a window across the plaintext and HMAC-SHA256s each n-gram using your secret key. The window size defaults to 3 (trigrams) and is configurable per attribute with `gram_size:`:
|
|
21
|
+
|
|
22
|
+
```
|
|
23
|
+
"smith" → ["smi", "mit", "ith"] → [hmac("smi"), hmac("mit"), hmac("ith")]
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
By default values are downcased before hashing, so search is **case-insensitive** (`"smi"` matches `"Smith"`). Set `case_sensitive: true` to opt out.
|
|
27
|
+
|
|
28
|
+
These hashes are stored in a `jsonb` array column. Querying with `where(email: "mit")` generates the same hashes for the search term and uses a PostgreSQL `@>` (contains) check — no plaintext ever touches the database.
|
|
29
|
+
|
|
30
|
+
Partial search is **approximate**: `@>` matches when the stored array contains *all* of the search term's n-gram hashes, which is occasionally satisfied by values that don't actually contain the term as a contiguous substring. Treat it like a fast candidate filter; if you need exact substring semantics, re-filter the returned (decrypted) records in Ruby.
|
|
31
|
+
|
|
32
|
+
### Exact search — single blind index
|
|
33
|
+
|
|
34
|
+
For exact match, a single HMAC-SHA256 of the full value is stored in a regular string column. Querying generates the same hash and does a standard equality check.
|
|
35
|
+
|
|
36
|
+
Both hash functions live in a Rust extension (`magnus` bindings + the `hmac` and `sha2` crates) and are called transparently from Ruby.
|
|
37
|
+
|
|
38
|
+
### Column encryption (the full picture)
|
|
39
|
+
|
|
40
|
+
PiiCipher only generates the blind indexes — it does not encrypt the column itself. Column encryption is handled by Rails AR Encryption (`encrypts`). The two work at different layers and do not interfere:
|
|
41
|
+
|
|
42
|
+
```
|
|
43
|
+
user.save
|
|
44
|
+
├─ before_save (pii_cipher) → reads plaintext → writes hashes to email_bidx_array
|
|
45
|
+
└─ DB write (Rails AR Enc.) → encrypts plaintext → writes ciphertext to email column
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Because Rails AR Encryption works at the DB serialization layer (not a callback), `self.email` always returns plaintext during `before_save` — pii_cipher always hashes the real value, never the ciphertext.
|
|
49
|
+
|
|
50
|
+
## Requirements
|
|
51
|
+
|
|
52
|
+
- Ruby >= 3.1
|
|
53
|
+
- Rails / ActiveRecord >= 7.1 (Active Record Encryption itself first shipped in Rails 7.0, but this gem requires 7.1 or newer)
|
|
54
|
+
- PostgreSQL (partial search relies on the `jsonb` `@>` operator)
|
|
55
|
+
- Rust toolchain (only needed when building the gem from source)
|
|
56
|
+
|
|
57
|
+
## Installation
|
|
58
|
+
|
|
59
|
+
Add to your `Gemfile`:
|
|
60
|
+
|
|
61
|
+
```ruby
|
|
62
|
+
gem "pii_cipher"
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Then run:
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
bundle install
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Setup
|
|
72
|
+
|
|
73
|
+
### 1. Generate Rails AR Encryption keys
|
|
74
|
+
|
|
75
|
+
Run this once to generate the three keys Rails AR Encryption needs:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
bin/rails db:encryption:init
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Copy the output into your credentials file:
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
bin/rails credentials:edit
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
```yaml
|
|
88
|
+
active_record_encryption:
|
|
89
|
+
primary_key: <generated>
|
|
90
|
+
deterministic_key: <generated>
|
|
91
|
+
key_derivation_salt: <generated>
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
These keys encrypt and decrypt the column values. Keep them in your secrets manager — losing them means losing access to your data.
|
|
95
|
+
|
|
96
|
+
### 2. Set the PiiCipher secret key
|
|
97
|
+
|
|
98
|
+
PiiCipher reads the HMAC key from the `PII_SECRET_KEY` environment variable. Add it to your environment (e.g. via credentials, dotenv, or your secrets manager):
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
PII_SECRET_KEY=your-long-random-secret-here
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Generate a secure random value with:
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
rails secret
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
Changing this key will invalidate all existing blind indexes.
|
|
111
|
+
|
|
112
|
+
### 3. Add blind index columns
|
|
113
|
+
|
|
114
|
+
For each encrypted attribute, add the corresponding blind index column in a migration.
|
|
115
|
+
|
|
116
|
+
**Partial search** (default — stores trigram hashes in a `jsonb` array):
|
|
117
|
+
|
|
118
|
+
```ruby
|
|
119
|
+
class AddEmailBidxToUsers < ActiveRecord::Migration[8.1]
|
|
120
|
+
def change
|
|
121
|
+
add_column :users, :email_bidx_array, :jsonb
|
|
122
|
+
add_index :users, :email_bidx_array, using: :gin
|
|
123
|
+
end
|
|
124
|
+
end
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
**Exact search** (stores a single hash string):
|
|
128
|
+
|
|
129
|
+
```ruby
|
|
130
|
+
class AddSsnBidxToUsers < ActiveRecord::Migration[8.1]
|
|
131
|
+
def change
|
|
132
|
+
add_column :users, :ssn_bidx, :string
|
|
133
|
+
add_index :users, :ssn_bidx
|
|
134
|
+
end
|
|
135
|
+
end
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
The GIN index on `jsonb` columns is strongly recommended for performance on partial searches.
|
|
139
|
+
|
|
140
|
+
### 4. Declare encrypted attributes in your model
|
|
141
|
+
|
|
142
|
+
Declare `encrypts` (Rails AR Encryption) first, then `use_pii_cipher`. Both must be present for full GDPR-compliant searchable encryption.
|
|
143
|
+
|
|
144
|
+
```ruby
|
|
145
|
+
class User < ApplicationRecord
|
|
146
|
+
encrypts :email # Rails: stores ciphertext in DB, decrypts on read
|
|
147
|
+
use_pii_cipher :email # pii_cipher: generates trigram blind indexes from plaintext
|
|
148
|
+
|
|
149
|
+
encrypts :ssn
|
|
150
|
+
use_pii_cipher :ssn, partial: false # exact-match blind index
|
|
151
|
+
end
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
Multiple attributes can be passed to `use_pii_cipher` in a single call:
|
|
155
|
+
|
|
156
|
+
```ruby
|
|
157
|
+
encrypts :email, :phone_number
|
|
158
|
+
use_pii_cipher :email, :phone_number
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## Usage
|
|
162
|
+
|
|
163
|
+
### Saving records
|
|
164
|
+
|
|
165
|
+
No changes to your existing create/update code. Everything happens automatically:
|
|
166
|
+
|
|
167
|
+
```ruby
|
|
168
|
+
User.create!(email: "alice@example.com", ssn: "123-45-6789")
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
What happens under the hood:
|
|
172
|
+
|
|
173
|
+
1. `before_save` (pii_cipher) reads `"alice@example.com"` as plaintext, generates trigram hashes, writes them to `email_bidx_array`
|
|
174
|
+
2. Rails AR Encryption encrypts `"alice@example.com"` and writes ciphertext to the `email` column
|
|
175
|
+
|
|
176
|
+
### What's in the database vs what Ruby sees
|
|
177
|
+
|
|
178
|
+
```ruby
|
|
179
|
+
user = User.find(1)
|
|
180
|
+
|
|
181
|
+
# Ruby — always decrypted transparently by Rails
|
|
182
|
+
user.email
|
|
183
|
+
# => "alice@example.com"
|
|
184
|
+
|
|
185
|
+
# Raw database row — email column holds ciphertext, blind index holds hashes
|
|
186
|
+
# email => {"p":"Wd5LybiwJGPHYI...","h":{"iv":"XJul...","at":"Pk..."}}
|
|
187
|
+
# email_bidx_array => ["a3f2c1...", "9b4e7d...", ...]
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
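Nobody with direct database access can read the email. The blind index consists of opaque keyed hashes — without the `PII_SECRET_KEY` they cannot be reversed or recomputed from guessed values. Note that the hashes are deterministic: identical n-grams (or identical full values, in exact mode) produce identical hashes across rows, so the index does expose equality and frequency patterns even though it never exposes the underlying text.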
|
|
191
|
+
|
|
192
|
+
### Querying
|
|
193
|
+
|
|
194
|
+
Pass the plaintext value to `where` exactly as you normally would — PiiCipher intercepts encrypted columns and rewrites the query to search the blind index:
|
|
195
|
+
|
|
196
|
+
```ruby
|
|
197
|
+
# Partial search — finds any user whose email contains "alice"
|
|
198
|
+
User.where(email: "alice")
|
|
199
|
+
|
|
200
|
+
# Exact search — finds the user with that exact SSN
|
|
201
|
+
User.where(ssn: "123-45-6789")
|
|
202
|
+
|
|
203
|
+
# Mix encrypted and plain columns freely
|
|
204
|
+
User.where(email: "alice", status: "active")
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
The found records have their emails decrypted by Rails on the way out — callers always receive plaintext. The interceptor only rewrites keys declared with `use_pii_cipher`; all other `where` calls pass through to ActiveRecord unchanged.
|
|
208
|
+
|
|
209
|
+
## Performance
|
|
210
|
+
|
|
211
|
+
Benchmarked on a local machine against PostgreSQL 18 with 100,000 rows. The comparison baseline is a plain (unencrypted) column with a standard index — the closest real-world alternative for each search type.
|
|
212
|
+
|
|
213
|
+
### Writes
|
|
214
|
+
|
|
215
|
+
| | Time (100k rows) |
|
|
216
|
+
|---|---|
|
|
217
|
+
| Plain insert | 1,221 ms |
|
|
218
|
+
| Encrypted insert | 2,861 ms (+134%) |
|
|
219
|
+
|
|
220
|
+
The overhead is not from the Rust hashing — that runs in microseconds. It comes from **writing significantly more data per row**: each record gains a `jsonb` array of 64-character HMAC hex strings (one per trigram) and a 64-character blind index string. Both the larger rows and the GIN index maintenance during insert contribute to the slower writes.
|
|
221
|
+
|
|
222
|
+
### Reads
|
|
223
|
+
|
|
224
|
+
| Query type | Plain | Encrypted | Difference |
|
|
225
|
+
|---|---|---|---|
|
|
226
|
+
| Exact match (B-tree) | 0.121 ms | 0.095 ms | ~within noise |
|
|
227
|
+
| Partial match (GIN) | 1.515 ms | 1.865 ms | +23% |
|
|
228
|
+
|
|
229
|
+
**Exact match** is effectively identical. Both paths hit a B-tree index; the lookup cost is the same regardless of what the key looks like.
|
|
230
|
+
|
|
231
|
+
**Partial match** is ~23% slower. The GIN index sizes end up comparable (see below), but PostgreSQL has to parse the `jsonb` array and evaluate the `@>` containment operator on each probe, which adds a small constant overhead that `pg_trgm`'s native GIN operator doesn't pay.
|
|
232
|
+
|
|
233
|
+
### Storage
|
|
234
|
+
|
|
235
|
+
| | Table total | Email index | Name GIN index |
|
|
236
|
+
|---|---|---|---|
|
|
237
|
+
| Plain | 21 MB | 5 MB | 7.2 MB |
|
|
238
|
+
| Encrypted | 89 MB | 12 MB | 7.0 MB |
|
|
239
|
+
|
|
240
|
+
The table is **4.2× larger**. Every stored trigram hash is 64 characters regardless of what the original value looked like — a 5-character name still produces 3 trigrams × 64 chars = 192 bytes of blind index data. At large scale, this is the dominant cost to plan for.
|
|
241
|
+
|
|
242
|
+
The email B-tree index is 2.4× larger for the same reason (64-char hash vs ~25-char email). The name GIN index sizes are nearly identical — HMAC hashes repeat across rows the same way plain trigrams do (same input + same key = same hash), so the GIN posting lists compress similarly.
|
|
243
|
+
|
|
244
|
+
### What this means in practice
|
|
245
|
+
|
|
246
|
+
- **Reads are fast.** Sub-millisecond exact lookups and ~2ms partial searches hold up well even at this row count.
|
|
247
|
+
- **Writes cost more.** If your workload is write-heavy on PII fields, budget for the extra insert time.
|
|
248
|
+
- **Storage is the main tradeoff.** Plan for roughly 4× the total on-disk footprint (table plus its indexes) compared to an equivalent unencrypted schema.
|
|
249
|
+
|
|
250
|
+
You can reproduce these results yourself:
|
|
251
|
+
|
|
252
|
+
```bash
|
|
253
|
+
ruby -I lib benchmarks/run.rb
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
## Configuration reference
|
|
257
|
+
|
|
258
|
+
`use_pii_cipher(*attributes, partial: true, gram_size: 3, case_sensitive: false)`
|
|
259
|
+
|
|
260
|
+
| Option | Type | Default | Description |
|
|
261
|
+
|--------|------|---------|-------------|
|
|
262
|
+
| `partial` | Boolean | `true` | `true` → n-gram array in `column_bidx_array`; `false` → single hash in `column_bidx` |
|
|
263
|
+
| `gram_size` | Integer | `3` | Sliding-window size for partial search. Ignored when `partial: false`. Changing it invalidates existing indexes. |
|
|
264
|
+
| `case_sensitive` | Boolean | `false` | `false` downcases values before hashing (case-insensitive search). Must match between stored index and queries; changing it invalidates existing indexes. |
|
|
265
|
+
|
|
266
|
+
## Limitations & gotchas
|
|
267
|
+
|
|
268
|
+
- **Query rewriting covers hash-form `where`.** `Model.where(email: "x")`, scopes, and chained relations (`Model.active.where(email: "x")`) are all rewritten. Conditions that don't go through `where(hash)` are **not** rewritten — including `where.not(...)`, raw string/array conditions (`where("email = ?", x)`), `.or(...)` branches, and `find_by` with string SQL. For those, build the blind index yourself with `PiiCipher.generate_ngram_hashes` / `generate_blind_index`.
|
|
269
|
+
- **Partial search is approximate** and may over-match (see "How it works"). Re-filter in Ruby if you need exact substring semantics.
|
|
270
|
+
- **Search terms shorter than `gram_size`** are hashed whole and only match values that were themselves shorter than `gram_size`. Prefer search terms at least `gram_size` characters long.
|
|
271
|
+
- **PostgreSQL only** for partial search — it uses the `jsonb` `@>` containment operator.
|
|
272
|
+
- **Key/option changes invalidate indexes.** Changing `PII_SECRET_KEY`, `gram_size`, or `case_sensitive` means existing blind indexes no longer match; you must re-save affected records to regenerate them.
|
|
273
|
+
|
|
274
|
+
## Development
|
|
275
|
+
|
|
276
|
+
After checking out the repo, run `bin/setup` to install dependencies (this also compiles the Rust extension). Then run the test suite:
|
|
277
|
+
|
|
278
|
+
```bash
|
|
279
|
+
bundle exec rake spec
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
The Ruby specs include a PostgreSQL-backed integration suite (it builds a temporary table and exercises real `@>` queries). Set the standard `PG*` env vars to point at a database, or skip those examples with `bundle exec rspec --tag ~integration`. The Rust extension also has its own unit tests, runnable from `ext/pii_cipher` with `cargo test`.
|
|
283
|
+
|
|
284
|
+
To open an interactive console with the gem loaded:
|
|
285
|
+
|
|
286
|
+
```bash
|
|
287
|
+
bin/console
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
To build and install the gem locally:
|
|
291
|
+
|
|
292
|
+
```bash
|
|
293
|
+
bundle exec rake install
|
|
294
|
+
```
|
|
295
|
+
|
|
296
|
+
## Contributing
|
|
297
|
+
|
|
298
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/selvachezhian/pii_cipher. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](CODE_OF_CONDUCT.md).
|
|
299
|
+
|
|
300
|
+
## License
|
|
301
|
+
|
|
302
|
+
The gem is available as open source under the terms of the [MIT License](LICENSE.txt).
|
data/Rakefile
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "bundler/gem_tasks"
|
|
4
|
+
require "rspec/core/rake_task"
|
|
5
|
+
|
|
6
|
+
RSpec::Core::RakeTask.new(:spec)
|
|
7
|
+
|
|
8
|
+
require "rubocop/rake_task"
|
|
9
|
+
|
|
10
|
+
RuboCop::RakeTask.new
|
|
11
|
+
|
|
12
|
+
require "rb_sys/extensiontask"
|
|
13
|
+
|
|
14
|
+
task build: :compile
|
|
15
|
+
|
|
16
|
+
GEMSPEC = Gem::Specification.load("pii_cipher.gemspec")
|
|
17
|
+
|
|
18
|
+
RbSys::ExtensionTask.new("pii_cipher", GEMSPEC) do |ext|
|
|
19
|
+
ext.lib_dir = "lib/pii_cipher"
|
|
20
|
+
ext.cross_compile = true
|
|
21
|
+
ext.cross_platform = %w[
|
|
22
|
+
x86_64-linux
|
|
23
|
+
aarch64-linux
|
|
24
|
+
x86_64-darwin
|
|
25
|
+
arm64-darwin
|
|
26
|
+
x64-mingw-ucrt
|
|
27
|
+
]
|
|
28
|
+
ext.cross_compiling do |spec|
|
|
29
|
+
# rb_sys is only needed to compile from source; pre-built gems don't need it
|
|
30
|
+
spec.dependencies.reject! { |dep| dep.name == "rb_sys" }
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
task default: %i[compile spec rubocop]
|
data/benchmarks/run.rb
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
require 'active_record'
|
|
2
|
+
require 'benchmark'
|
|
3
|
+
require 'pg'
|
|
4
|
+
|
|
5
|
+
$LOAD_PATH.unshift File.join(__dir__, '..', 'lib')
|
|
6
|
+
require 'pii_cipher'
|
|
7
|
+
|
|
8
|
+
DB = 'pii_cipher_benchmark'
|
|
9
|
+
SECRET = 'benchmark-secret-key-do-not-use-in-prod'
|
|
10
|
+
ROWS = 100_000
|
|
11
|
+
QUERIES = 1_000
|
|
12
|
+
|
|
13
|
+
NAMES = %w[
|
|
14
|
+
Smith Johnson Williams Brown Jones Garcia Miller Davis Wilson Moore
|
|
15
|
+
Taylor Anderson Thomas Jackson White Harris Martin Thompson Chezhian
|
|
16
|
+
Robinson Clark Rodriguez Lewis Lee Walker Hall Allen Young Hernandez
|
|
17
|
+
].freeze
|
|
18
|
+
|
|
19
|
+
DOMAINS = %w[gmail.com yahoo.com outlook.com protonmail.com icloud.com].freeze
|
|
20
|
+
|
|
21
|
+
# Returns a random surname from NAMES followed by a random integer below 9999.
def rand_name = [NAMES.sample, rand(9999)].join
|
|
22
|
+
# Returns a random address: downcased surname from NAMES, a random integer below 9999, "@", and a domain from DOMAINS.
def rand_email = [NAMES.sample.downcase, rand(9999), '@', DOMAINS.sample].join
|
|
23
|
+
|
|
24
|
+
# ── DB setup ─────────────────────────────────────────────────────────────────
|
|
25
|
+
|
|
26
|
+
# Points ActiveRecord::Base at the given PostgreSQL database on localhost
# and returns the resulting connection object.
#
# @param db [String] database name to connect to (defaults to 'postgres')
def connect(db = 'postgres')
  settings = { adapter: 'postgresql', database: db, host: 'localhost' }
  ActiveRecord::Base.establish_connection(settings)
  ActiveRecord::Base.connection
end
|
|
30
|
+
|
|
31
|
+
puts "==> Setting up database '#{DB}'..."
|
|
32
|
+
conn = connect('postgres')
|
|
33
|
+
conn.execute("DROP DATABASE IF EXISTS #{DB}")
|
|
34
|
+
conn.execute("CREATE DATABASE #{DB}")
|
|
35
|
+
|
|
36
|
+
conn = connect(DB)
|
|
37
|
+
conn.execute('CREATE EXTENSION IF NOT EXISTS pg_trgm')
|
|
38
|
+
|
|
39
|
+
conn.create_table :plain_users, force: true do |t|
|
|
40
|
+
t.string :name, null: false
|
|
41
|
+
t.string :email, null: false
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
conn.create_table :encrypted_users, force: true do |t|
|
|
45
|
+
t.string :name, null: false
|
|
46
|
+
t.string :email, null: false
|
|
47
|
+
t.jsonb :name_bidx_array
|
|
48
|
+
t.string :email_bidx
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
# Indexes on plain table
|
|
52
|
+
conn.execute('CREATE INDEX idx_plain_email ON plain_users (email)')
|
|
53
|
+
conn.execute('CREATE INDEX idx_plain_name_trgm ON plain_users USING GIN (name gin_trgm_ops)')
|
|
54
|
+
|
|
55
|
+
# Indexes on encrypted table
|
|
56
|
+
conn.execute('CREATE INDEX idx_enc_email_bidx ON encrypted_users (email_bidx)')
|
|
57
|
+
conn.execute('CREATE INDEX idx_enc_name_bidx_array ON encrypted_users USING GIN (name_bidx_array)')
|
|
58
|
+
|
|
59
|
+
puts "==> Tables and indexes created.\n\n"
|
|
60
|
+
|
|
61
|
+
# ── Seed data (pre-generate so Rust/hash time isn't mixed with AR overhead) ──
|
|
62
|
+
|
|
63
|
+
puts "==> Pre-generating #{ROWS} rows of test data..."
|
|
64
|
+
plain_rows = ROWS.times.map { { name: rand_name, email: rand_email } }
|
|
65
|
+
encrypted_rows = plain_rows.map do |r|
|
|
66
|
+
{
|
|
67
|
+
name: r[:name],
|
|
68
|
+
email: r[:email],
|
|
69
|
+
name_bidx_array: PiiCipher.generate_ngram_hashes(r[:name].downcase, SECRET, 3).to_json,
|
|
70
|
+
email_bidx: PiiCipher.generate_blind_index(r[:email].downcase, SECRET)
|
|
71
|
+
}
|
|
72
|
+
end
|
|
73
|
+
puts "==> Done.\n\n"
|
|
74
|
+
|
|
75
|
+
# ── Benchmark writes ──────────────────────────────────────────────────────────
|
|
76
|
+
|
|
77
|
+
puts "=" * 60
|
|
78
|
+
puts "WRITE BENCHMARK (#{ROWS} rows, batch insert)"
|
|
79
|
+
puts "=" * 60
|
|
80
|
+
|
|
81
|
+
write_results = Benchmark.bm(30) do |x|
|
|
82
|
+
x.report("Plain insert:") do
|
|
83
|
+
plain_rows.each_slice(1000) do |batch|
|
|
84
|
+
conn.execute(
|
|
85
|
+
"INSERT INTO plain_users (name, email) VALUES " +
|
|
86
|
+
batch.map { |r| "('#{conn.quote_string(r[:name])}','#{conn.quote_string(r[:email])}')" }.join(',')
|
|
87
|
+
)
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
x.report("Encrypted insert:") do
|
|
92
|
+
encrypted_rows.each_slice(1000) do |batch|
|
|
93
|
+
conn.execute(
|
|
94
|
+
"INSERT INTO encrypted_users (name, email, name_bidx_array, email_bidx) VALUES " +
|
|
95
|
+
batch.map { |r|
|
|
96
|
+
name_array = r[:name_bidx_array].gsub("'", "''")
|
|
97
|
+
"('#{conn.quote_string(r[:name])}','#{conn.quote_string(r[:email])}'," \
|
|
98
|
+
"'#{name_array}'::jsonb,'#{conn.quote_string(r[:email_bidx])}')"
|
|
99
|
+
}.join(',')
|
|
100
|
+
)
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
plain_write_ms = (write_results[0].real * 1000).round(1)
|
|
106
|
+
enc_write_ms = (write_results[1].real * 1000).round(1)
|
|
107
|
+
write_overhead = ((enc_write_ms - plain_write_ms) / plain_write_ms * 100).round(1)
|
|
108
|
+
|
|
109
|
+
# ── Benchmark reads ───────────────────────────────────────────────────────────
|
|
110
|
+
|
|
111
|
+
# Pick real values that exist in the DB for fair comparison
|
|
112
|
+
sample_plain_row = conn.execute('SELECT name, email FROM plain_users ORDER BY RANDOM() LIMIT 1').first
|
|
113
|
+
exact_email = sample_plain_row['email']
|
|
114
|
+
partial_name_term = sample_plain_row['name'][0, 4] # 4-char prefix
|
|
115
|
+
|
|
116
|
+
exact_email_bidx = PiiCipher.generate_blind_index(exact_email.downcase, SECRET)
|
|
117
|
+
partial_hashes_json = PiiCipher.generate_ngram_hashes(partial_name_term.downcase, SECRET, 3).to_json
|
|
118
|
+
|
|
119
|
+
puts "\n"
|
|
120
|
+
puts "=" * 60
|
|
121
|
+
puts "READ BENCHMARK (#{QUERIES} queries each)"
|
|
122
|
+
puts " Exact search term : #{exact_email}"
|
|
123
|
+
puts " Partial search term: #{partial_name_term}"
|
|
124
|
+
puts "=" * 60
|
|
125
|
+
|
|
126
|
+
read_results = Benchmark.bm(30) do |x|
|
|
127
|
+
x.report("Plain exact (B-tree):") do
|
|
128
|
+
QUERIES.times { conn.execute("SELECT id FROM plain_users WHERE email = '#{conn.quote_string(exact_email)}'") }
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
x.report("Encrypted exact (blind idx):") do
|
|
132
|
+
QUERIES.times { conn.execute("SELECT id FROM encrypted_users WHERE email_bidx = '#{conn.quote_string(exact_email_bidx)}'") }
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
x.report("Plain partial (pg_trgm GIN):") do
|
|
136
|
+
QUERIES.times { conn.execute("SELECT id FROM plain_users WHERE name LIKE '%#{conn.quote_string(partial_name_term)}%'") }
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
x.report("Encrypted partial (bidx GIN):") do
|
|
140
|
+
QUERIES.times { conn.execute("SELECT id FROM encrypted_users WHERE name_bidx_array @> '#{partial_hashes_json}'::jsonb") }
|
|
141
|
+
end
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
plain_exact_ms = (read_results[0].real * 1000 / QUERIES).round(3)
|
|
145
|
+
enc_exact_ms = (read_results[1].real * 1000 / QUERIES).round(3)
|
|
146
|
+
plain_partial_ms = (read_results[2].real * 1000 / QUERIES).round(3)
|
|
147
|
+
enc_partial_ms = (read_results[3].real * 1000 / QUERIES).round(3)
|
|
148
|
+
|
|
149
|
+
exact_overhead = ((enc_exact_ms - plain_exact_ms) / plain_exact_ms * 100).round(1)
|
|
150
|
+
partial_overhead = ((enc_partial_ms - plain_partial_ms) / plain_partial_ms * 100).round(1)
|
|
151
|
+
|
|
152
|
+
# ── Storage sizes ─────────────────────────────────────────────────────────────
|
|
153
|
+
|
|
154
|
+
# Asks the database for the human-readable total relation size of +table+
# (pg_total_relation_size passed through pg_size_pretty).
#
# @param conn  connection object responding to #execute(sql)
# @param table [String] relation name, interpolated verbatim into the SQL
# @return the 'pg_size_pretty' value of the first result row
def table_size(conn, table)
  sql = "SELECT pg_size_pretty(pg_total_relation_size('#{table}'))"
  first_row = conn.execute(sql).first
  first_row['pg_size_pretty']
end
|
|
157
|
+
|
|
158
|
+
# Asks the database for the human-readable size of +index+
# (pg_relation_size passed through pg_size_pretty).
#
# @param conn  connection object responding to #execute(sql)
# @param index [String] index name, interpolated verbatim into the SQL
# @return the 'pg_size_pretty' value of the first row, or 'n/a' when the query yields no row
def index_size(conn, index)
  row = conn.execute("SELECT pg_size_pretty(pg_relation_size('#{index}'))").first
  return 'n/a' unless row

  row['pg_size_pretty']
end
|
|
162
|
+
|
|
163
|
+
plain_size = table_size(conn, 'plain_users')
|
|
164
|
+
encrypted_size = table_size(conn, 'encrypted_users')
|
|
165
|
+
|
|
166
|
+
plain_email_idx_size = index_size(conn, 'idx_plain_email')
|
|
167
|
+
plain_name_trgm_size = index_size(conn, 'idx_plain_name_trgm')
|
|
168
|
+
enc_email_bidx_size = index_size(conn, 'idx_enc_email_bidx')
|
|
169
|
+
enc_name_bidx_arr_size = index_size(conn, 'idx_enc_name_bidx_array')
|
|
170
|
+
|
|
171
|
+
# ── Print summary ─────────────────────────────────────────────────────────────
|
|
172
|
+
|
|
173
|
+
puts "\n"
|
|
174
|
+
puts "=" * 60
|
|
175
|
+
puts "SUMMARY (#{ROWS} rows)"
|
|
176
|
+
puts "=" * 60
|
|
177
|
+
|
|
178
|
+
puts "\n--- Writes ---"
|
|
179
|
+
puts " Plain insert: #{plain_write_ms} ms total"
|
|
180
|
+
# Print the overhead with an explicit sign; a hard-coded "+" would render a negative overhead as "+-x%".
puts " Encrypted insert: #{enc_write_ms} ms total (#{format('%+.1f', write_overhead)}% overhead)"
|
|
181
|
+
|
|
182
|
+
puts "\n--- Reads (avg per query) ---"
|
|
183
|
+
puts " Exact match"
|
|
184
|
+
puts " Plain (B-tree): #{plain_exact_ms} ms"
|
|
185
|
+
# Print the overhead with an explicit sign: the blind-index lookup can be faster than the plain one,
# and a hard-coded "+" would then render as "+-x%".
puts " Encrypted (blind idx): #{enc_exact_ms} ms (#{format('%+.1f', exact_overhead)}% overhead)"
|
|
186
|
+
puts " Partial match"
|
|
187
|
+
puts " Plain (pg_trgm GIN): #{plain_partial_ms} ms"
|
|
188
|
+
# Print the overhead with an explicit sign; a hard-coded "+" would render a negative overhead as "+-x%".
puts " Encrypted (bidx GIN): #{enc_partial_ms} ms (#{format('%+.1f', partial_overhead)}% overhead)"
|
|
189
|
+
|
|
190
|
+
puts "\n--- Storage ---"
|
|
191
|
+
puts " Plain table total: #{plain_size}"
|
|
192
|
+
puts " Encrypted table total: #{encrypted_size}"
|
|
193
|
+
puts ""
|
|
194
|
+
puts " Plain email index (B-tree): #{plain_email_idx_size}"
|
|
195
|
+
puts " Plain name index (pg_trgm GIN): #{plain_name_trgm_size}"
|
|
196
|
+
puts " Encrypted email index (B-tree): #{enc_email_bidx_size}"
|
|
197
|
+
puts " Encrypted name index (GIN): #{enc_name_bidx_arr_size}"
|
|
198
|
+
|
|
199
|
+
# ── Cleanup ───────────────────────────────────────────────────────────────────
|
|
200
|
+
|
|
201
|
+
ActiveRecord::Base.remove_connection
|
|
202
|
+
connect('postgres').execute("DROP DATABASE #{DB}")
|
|
203
|
+
puts "\n==> Benchmark database dropped. Done."
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
[package]
|
|
2
|
+
name = "pii_cipher"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
edition = "2021"
|
|
5
|
+
authors = ["Selva Chezhian <selvachezhian.labam@gmail.com>"]
|
|
6
|
+
license = "MIT"
|
|
7
|
+
publish = false
|
|
8
|
+
|
|
9
|
+
[lib]
|
|
10
|
+
crate-type = ["cdylib"]
|
|
11
|
+
|
|
12
|
+
[dependencies]
|
|
13
|
+
magnus = { version = "0.8.2" }
|
|
14
|
+
rb-sys = { version = "0.9", features = ["stable-api-compiled-fallback"] }
|
|
15
|
+
sha2 = "0.10"
|
|
16
|
+
hmac = "0.12"
|
|
17
|
+
|
|
18
|
+
[build-dependencies]
|
|
19
|
+
rb-sys-env = "0.2.2"
|
|
20
|
+
|
|
21
|
+
[dev-dependencies]
|
|
22
|
+
rb-sys-test-helpers = { version = "0.2.2" }
|