solyanka 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- solyanka-0.3.0/.gitignore +14 -0
- solyanka-0.3.0/LICENSE +21 -0
- solyanka-0.3.0/PKG-INFO +271 -0
- solyanka-0.3.0/README.md +237 -0
- solyanka-0.3.0/pyproject.toml +29 -0
- solyanka-0.3.0/solyanka/__init__.py +29 -0
- solyanka-0.3.0/solyanka/pattern_preview.py +578 -0
- solyanka-0.3.0/solyanka/transaction_patterns/__init__.py +5 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/brazil.yml +1094 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/colombia.yml +26 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/eea.yml +122 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/france.yml +19 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/general.yml +1480 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/georgia.yml +477 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/germany.yml +238 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/hungary.yml +119 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/indonesia.yml +1034 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/japan.yml +140 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/laos.yml +27 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/latvia.yml +181 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/malaysia.yml +1233 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/oae.yml +58 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/oman.yml +28 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/portugal.yml +425 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/schema.json +320 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/spain.yml +322 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/thailand.yml +9128 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/turkey.yml +32 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/uae.yml +412 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/uk.yml +86 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/ukraine.yml +14 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/usa.yml +280 -0
- solyanka-0.3.0/solyanka/transaction_patterns/data/vietnam.yml +2819 -0
- solyanka-0.3.0/solyanka/transaction_patterns/service.py +286 -0
solyanka-0.3.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) kvokka. All rights reserved.
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
solyanka-0.3.0/PKG-INFO
ADDED
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: solyanka
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Transaction pattern utilities and dataset for statement generators
|
|
5
|
+
Author-email: Development Team <dev@company.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) kvokka. All rights reserved.
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
License-File: LICENSE
|
|
28
|
+
Requires-Python: >=3.11
|
|
29
|
+
Requires-Dist: pyyaml>=6.0
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: jsonschema>=4.23; extra == 'dev'
|
|
32
|
+
Requires-Dist: pytest>=8.2; extra == 'dev'
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
|
|
35
|
+
# Solyanka
|
|
36
|
+
|
|
37
|
+
Toolkit + dataset for transaction-pattern driven synthetic statements and downstream LLM fine-tuning.
|
|
38
|
+
This package ships the curated YAML files, their schema, and a tiny loader that apps or notebooks can
|
|
39
|
+
use without worrying about file layout.
|
|
40
|
+
|
|
41
|
+
## Install
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install solyanka # consumers
|
|
45
|
+
pip install -e ".[dev]" # local hacking (tests + linters)
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Runtime use
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
from solyanka import PatternsService
|
|
52
|
+
|
|
53
|
+
svc = PatternsService() # auto-discovers packaged data
|
|
54
|
+
general = svc.load_general_patterns()
|
|
55
|
+
eea = svc.load_eea_patterns()
|
|
56
|
+
thailand = svc.load_country_patterns("Thailand")
|
|
57
|
+
|
|
58
|
+
# Recommended helper: general + (EEA) + country
|
|
59
|
+
bundle = svc.get_country_patterns("Germany")
|
|
60
|
+
|
|
61
|
+
# Fine-grained slices (e.g., validation scripts)
|
|
62
|
+
custom = svc.get_patterns(country="Germany", include="general,eea")
|
|
63
|
+
|
|
64
|
+
# API-ready dictionaries
|
|
65
|
+
payload = svc.get_pattern_dicts(country="Spain")
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Override the dataset path (e.g., while editing YAML) via `PatternsService(base_dir=Path("./transaction_patterns"))`
|
|
69
|
+
or the `TRANSACTION_PATTERNS_DIR` environment variable.
|
|
70
|
+
|
|
71
|
+
## Layout
|
|
72
|
+
|
|
73
|
+
- `solyanka/transaction_patterns/data/*.yml` — curated pattern files (`general.yml`, `eea.yml`, `<country>.yml`).
|
|
74
|
+
- `solyanka/transaction_patterns/data/schema.json` — JSON Schema enforced by tests/CI.
|
|
75
|
+
- `solyanka/transaction_patterns/service.py` — public loader API (keep backward compatible).
|
|
76
|
+
- `tests/` — schema regression + loader behaviour.
|
|
77
|
+
|
|
78
|
+
## Field spec
|
|
79
|
+
|
|
80
|
+
### Required fields
|
|
81
|
+
|
|
82
|
+
| Field | Meaning |
|
|
83
|
+
|------------------|---------------------------------------------------------------------------------------------------|
|
|
84
|
+
| `title` | Merchant label. Plain string or template object. |
|
|
85
|
+
| `prettyTitle` | Short merchant label for UI use. Strip cities/countries/punctuation manually (no scripts); follow the documented brand rules (Uber/Amazon/Airbnb/Youtube/Godaddy/Myprotein/Zenni/Bolt/Iherb/Lotus, etc.). |
|
|
86
|
+
| `currency` | Uppercase ISO 4217 (EUR, USD, GBP, ...). |
|
|
87
|
+
| `amountRange` | `{min, max}` floats describing the observed local-currency range. Mutually exclusive with `amounts`.|
|
|
88
|
+
| `amounts` | List of specific float amounts (e.g. `[9.99, 19.99]`). Use this **instead** of `amountRange` for fixed price points. |
|
|
89
|
+
| `amountFormat` | Rounding strategy: `n>0` decimals, `0` whole units, `n<0` powers of ten (e.g., `-2` rounds to 100s).|
|
|
90
|
+
| `types` | Non-empty list of lowercase tags (`shopping`, `restaurant`, `transportation`, ...). |
|
|
91
|
+
| `prettyTitle` | Short merchant label for UI use. Strip cities/countries/punctuation manually (no scripts); follow the documented brand rules (Uber/Amazon/Airbnb/Youtube/Godaddy/Myprotein/Zenni/Bolt/Iherb/Lotus, etc.). |
|
|
92
|
+
|
|
93
|
+
### Optional fields
|
|
94
|
+
|
|
95
|
+
| Field | Why / how |
|
|
96
|
+
|-----------------------------|-----------------------------------------------------------------------------------------|
|
|
97
|
+
| `weight` | Relative selection probability. 100 = baseline, 120–150 very common, 50 niche. |
|
|
98
|
+
| `refundProbability` | Chance (0–1) that the generator emits a `CARD_REFUND` for this pattern. |
|
|
99
|
+
| `refundDelayMinHours`/`Max` | Boundaries for automatic refund timing (defaults: 72 / 288 hours). |
|
|
100
|
+
| `numberOfOccurrences` | Global cap per statement (useful for rare, one-off merchants). |
|
|
101
|
+
| `subscriptionFrequencyDays` | Frequency for recurring charges (e.g., 30 for monthly subscriptions). |
|
|
102
|
+
| `country` | Required when using `region`. Valid values defined in `schema.json`. |
|
|
103
|
+
| `region` | Geographic region within a country. Valid values defined in `schema.json`. |
|
|
104
|
+
|
|
105
|
+
## Regions
|
|
106
|
+
|
|
107
|
+
The `region` field allows you to specify the geographic region within a country where a transaction pattern is localized. When using `region`, you must also specify `country` — the JSON schema validates that the region matches the country.
|
|
108
|
+
|
|
109
|
+
Valid countries and their regions are defined in `schema.json` (the `allOf` section with `if/then` rules). The schema is the single source of truth for allowed values.
|
|
110
|
+
|
|
111
|
+
**When to use regions:**
|
|
112
|
+
|
|
113
|
+
- Use `region` only when the transaction clearly belongs to a specific geographic area (e.g., a local restaurant, a regional shop).
|
|
114
|
+
- Do not set `region` for online services, nationwide chains, or when the location is ambiguous.
|
|
115
|
+
- When specifying `region`, you must also set `country` to the matching country name.
|
|
116
|
+
|
|
117
|
+
**Example with region:**
|
|
118
|
+
|
|
119
|
+
```yaml
|
|
120
|
+
- title: "Patong Beach Hotel PHUKET"
|
|
121
|
+
prettyTitle: "Patong Beach Hotel"
|
|
122
|
+
currency: "THB"
|
|
123
|
+
amountRange: {min: 2000, max: 8000}
|
|
124
|
+
amountFormat: 0
|
|
125
|
+
types: ["hotel", "accommodation"]
|
|
126
|
+
country: "Thailand"
|
|
127
|
+
region: "Phuket"
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## Template titles
|
|
131
|
+
|
|
132
|
+
```yaml
|
|
133
|
+
title:
|
|
134
|
+
type: template
|
|
135
|
+
template: "Revolut**{num}* DUBLIN"
|
|
136
|
+
prettyTitle: "Revolut"
|
|
137
|
+
params:
|
|
138
|
+
num:
|
|
139
|
+
generator: random_digits
|
|
140
|
+
length: 4
|
|
141
|
+
zero_pad: true
|
|
142
|
+
globalConstant: true
|
|
143
|
+
transform:
|
|
144
|
+
case: upper
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### Generators & parameters
|
|
148
|
+
|
|
149
|
+
| Generator | Required params | Optional params | Notes |
|
|
150
|
+
|------------------|-----------------|--------------------------------|--------------------------------------------------|
|
|
151
|
+
| `random_digits` | `length` | `zero_pad` (default `true`) | digits only; zero_pad keeps leading zeroes |
|
|
152
|
+
| `random_alnum` | `length` | `charset` | mix of letters/digits; `charset` restricts symbols. The default is `abcdefghijklmnopqrstuvwxyz0123456789`. |
|
|
153
|
+
| `choice` | `options` | `weights` (same length) | uniform when weights omitted |
|
|
154
|
+
|
|
155
|
+
Extras:
|
|
156
|
+
|
|
157
|
+
- `globalConstant: true` — reuse the same generated value across the statement (great for IDs).
|
|
158
|
+
- `transform.case`: `upper`, `lower`, or `title`.
|
|
159
|
+
|
|
160
|
+
## Examples
|
|
161
|
+
|
|
162
|
+
### Simple grocery merchant
|
|
163
|
+
|
|
164
|
+
```yaml
|
|
165
|
+
- title: "Tesco Express"
|
|
166
|
+
prettyTitle: "Tesco Express"
|
|
167
|
+
currency: "GBP"
|
|
168
|
+
amountRange: {min: 5.0, max: 50.0}
|
|
169
|
+
amountFormat: 2
|
|
170
|
+
types: ["groceries", "shopping"]
|
|
171
|
+
weight: 120
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
### Subscription service
|
|
175
|
+
|
|
176
|
+
```yaml
|
|
177
|
+
- title: "Netflix.com"
|
|
178
|
+
prettyTitle: "Netflix"
|
|
179
|
+
currency: "EUR"
|
|
180
|
+
amountRange: {min: 13.49, max: 13.49}
|
|
181
|
+
amountFormat: 2
|
|
182
|
+
subscriptionFrequencyDays: 30
|
|
183
|
+
numberOfOccurrences: 10
|
|
184
|
+
types: ["entertainment", "subscription"]
|
|
185
|
+
weight: 300
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
### Template with refund metadata
|
|
189
|
+
|
|
190
|
+
```yaml
|
|
191
|
+
- title:
|
|
192
|
+
type: template
|
|
193
|
+
prettyTitle: "Airbnb"
|
|
194
|
+
template: "Airbnb * {code} 662-105-6167"
|
|
195
|
+
params:
|
|
196
|
+
code:
|
|
197
|
+
generator: random_alnum
|
|
198
|
+
length: 12
|
|
199
|
+
charset: "abcdefghijklmnopqrstuvwxyz0123456789"
|
|
200
|
+
transform:
|
|
201
|
+
case: lower
|
|
202
|
+
globalConstant: true
|
|
203
|
+
currency: "USD"
|
|
204
|
+
amountRange: {min: 70.0, max: 900.0}
|
|
205
|
+
amountFormat: 2
|
|
206
|
+
refundProbability: 0.4
|
|
207
|
+
types: ["housing"]
|
|
208
|
+
weight: 700
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
### Exact amounts (Fixed price points)
|
|
212
|
+
|
|
213
|
+
```yaml
|
|
214
|
+
- title: "Spotify Premium"
|
|
215
|
+
prettyTitle: "Spotify"
|
|
216
|
+
currency: "EUR"
|
|
217
|
+
amounts: [4.99, 9.99, 14.99]
|
|
218
|
+
amountFormat: 2
|
|
219
|
+
types: ["subscription", "entertainment"]
|
|
220
|
+
weight: 200
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
## Pattern authoring workflow
|
|
224
|
+
|
|
225
|
+
1. Pick the right file (`general.yml`, `eea.yml`, or `<country>.yml`).
|
|
226
|
+
2. Study existing entries (Thailand’s file is a good reference for tone + “uglified” merchant names).
|
|
227
|
+
3. Choose realistic `amountRange`, `amountFormat`, tags, and weights.
|
|
228
|
+
4. Use templates when merchants expose reference numbers.
|
|
229
|
+
5. Annotate generated blocks with comments (e.g., `# Generated transaction pattern - online food`).
|
|
230
|
+
6. Refresh `prettyTitle` after title tweaks by applying the manual derivation rules (strip city/country noise, drop IDs, canonicalize big brands).
|
|
231
|
+
7. Run `pytest` to validate against `schema.json` before committing/publishing.
|
|
232
|
+
|
|
233
|
+
## Tests & release
|
|
234
|
+
|
|
235
|
+
```bash
|
|
236
|
+
pytest # validates YAML + loader invariants
|
|
237
|
+
python -m build # optional local artifact check
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
- CI: `.github/workflows/ci.yml` runs pytest on push/PR.
|
|
241
|
+
- Release automation: merge PRs into `main` with `major release`, `minor release`, or `patch release` labels to control how `.github/workflows/release-tagger.yml` bumps the version after CI finishes green. No label defaults to a build bump (`v1.2.3` → `v1.2.3.1`, etc.). The workflow updates `pyproject.toml` and tags the commit as `v<version>`.
|
|
242
|
+
- PyPI publish: semantic tags (`vMAJOR.MINOR.PATCH` with optional `.<build_or_label>`) trigger `.github/workflows/release.yml`. The release tagger simply creates the tag, so publishing is entirely driven by tag pushes (manual or automated).
|
|
243
|
+
- Need to generate new country patterns for a task? See `AGENTS.md` for the full enrichment workflow.
|
|
244
|
+
|
|
245
|
+
## Pattern preview workflow
|
|
246
|
+
|
|
247
|
+
Pull requests that touch `solyanka/transaction_patterns/data/**` automatically run
|
|
248
|
+
`.github/workflows/pattern-preview.yml`. The workflow uses `python -m solyanka.pattern_preview`
|
|
249
|
+
to diff the branch against the PR base, synthesize up to three example transactions from the
|
|
250
|
+
touched patterns, and post a Markdown table comment (including short/pretty titles) back onto the PR so reviewers can eyeball
|
|
251
|
+
the new merchants. Preview-only fixtures live under `tests/pattern_preview/` and are injected
|
|
252
|
+
via the workflow using the `--extra-patterns` flag so they stay separate from the shipped data.
|
|
253
|
+
If the rendered tables grow beyond GitHub’s comment limit, the workflow automatically splits
|
|
254
|
+
the output into sequential comments while keeping each pattern block intact.
|
|
255
|
+
Run the same command locally to preview the output before pushing changes:
|
|
256
|
+
|
|
257
|
+
```bash
|
|
258
|
+
python -m solyanka.pattern_preview \
|
|
259
|
+
--base-ref origin/main \
|
|
260
|
+
--head-ref HEAD \
|
|
261
|
+
--samples-per-pattern 3 \
|
|
262
|
+
--extra-patterns tests/pattern_preview
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
Conventions: keep YAML human-readable (sorted keys, helpful comments), avoid UUID-looking titles, and update schema/tests whenever the structure changes.
|
|
266
|
+
|
|
267
|
+
## Purpose recap
|
|
268
|
+
|
|
269
|
+
Solyanka is the single source of truth for transaction-pattern assets used by the bank-statement
|
|
270
|
+
generator and any LLM training pipelines. Treat it like a dataset project: tight validation, small
|
|
271
|
+
focused API surface, deterministic releases.
|
solyanka-0.3.0/README.md
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
# Solyanka
|
|
2
|
+
|
|
3
|
+
Toolkit + dataset for transaction-pattern driven synthetic statements and downstream LLM fine-tuning.
|
|
4
|
+
This package ships the curated YAML files, their schema, and a tiny loader that apps or notebooks can
|
|
5
|
+
use without worrying about file layout.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install solyanka # consumers
|
|
11
|
+
pip install -e ".[dev]" # local hacking (tests + linters)
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## Runtime use
|
|
15
|
+
|
|
16
|
+
```python
|
|
17
|
+
from solyanka import PatternsService
|
|
18
|
+
|
|
19
|
+
svc = PatternsService() # auto-discovers packaged data
|
|
20
|
+
general = svc.load_general_patterns()
|
|
21
|
+
eea = svc.load_eea_patterns()
|
|
22
|
+
thailand = svc.load_country_patterns("Thailand")
|
|
23
|
+
|
|
24
|
+
# Recommended helper: general + (EEA) + country
|
|
25
|
+
bundle = svc.get_country_patterns("Germany")
|
|
26
|
+
|
|
27
|
+
# Fine-grained slices (e.g., validation scripts)
|
|
28
|
+
custom = svc.get_patterns(country="Germany", include="general,eea")
|
|
29
|
+
|
|
30
|
+
# API-ready dictionaries
|
|
31
|
+
payload = svc.get_pattern_dicts(country="Spain")
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Override the dataset path (e.g., while editing YAML) via `PatternsService(base_dir=Path("./transaction_patterns"))`
|
|
35
|
+
or the `TRANSACTION_PATTERNS_DIR` environment variable.
|
|
36
|
+
|
|
37
|
+
## Layout
|
|
38
|
+
|
|
39
|
+
- `solyanka/transaction_patterns/data/*.yml` — curated pattern files (`general.yml`, `eea.yml`, `<country>.yml`).
|
|
40
|
+
- `solyanka/transaction_patterns/data/schema.json` — JSON Schema enforced by tests/CI.
|
|
41
|
+
- `solyanka/transaction_patterns/service.py` — public loader API (keep backward compatible).
|
|
42
|
+
- `tests/` — schema regression + loader behaviour.
|
|
43
|
+
|
|
44
|
+
## Field spec
|
|
45
|
+
|
|
46
|
+
### Required fields
|
|
47
|
+
|
|
48
|
+
| Field | Meaning |
|
|
49
|
+
|------------------|---------------------------------------------------------------------------------------------------|
|
|
50
|
+
| `title` | Merchant label. Plain string or template object. |
|
|
51
|
+
| `prettyTitle` | Short merchant label for UI use. Strip cities/countries/punctuation manually (no scripts); follow the documented brand rules (Uber/Amazon/Airbnb/Youtube/Godaddy/Myprotein/Zenni/Bolt/Iherb/Lotus, etc.). |
|
|
52
|
+
| `currency` | Uppercase ISO 4217 (EUR, USD, GBP, ...). |
|
|
53
|
+
| `amountRange` | `{min, max}` floats describing the observed local-currency range. Mutually exclusive with `amounts`.|
|
|
54
|
+
| `amounts` | List of specific float amounts (e.g. `[9.99, 19.99]`). Use this **instead** of `amountRange` for fixed price points. |
|
|
55
|
+
| `amountFormat` | Rounding strategy: `n>0` decimals, `0` whole units, `n<0` powers of ten (e.g., `-2` rounds to 100s).|
|
|
56
|
+
| `types` | Non-empty list of lowercase tags (`shopping`, `restaurant`, `transportation`, ...). |
|
|
57
|
+
| `prettyTitle` | Short merchant label for UI use. Strip cities/countries/punctuation manually (no scripts); follow the documented brand rules (Uber/Amazon/Airbnb/Youtube/Godaddy/Myprotein/Zenni/Bolt/Iherb/Lotus, etc.). |
|
|
58
|
+
|
|
59
|
+
### Optional fields
|
|
60
|
+
|
|
61
|
+
| Field | Why / how |
|
|
62
|
+
|-----------------------------|-----------------------------------------------------------------------------------------|
|
|
63
|
+
| `weight` | Relative selection probability. 100 = baseline, 120–150 very common, 50 niche. |
|
|
64
|
+
| `refundProbability` | Chance (0–1) that the generator emits a `CARD_REFUND` for this pattern. |
|
|
65
|
+
| `refundDelayMinHours`/`Max` | Boundaries for automatic refund timing (defaults: 72 / 288 hours). |
|
|
66
|
+
| `numberOfOccurrences` | Global cap per statement (useful for rare, one-off merchants). |
|
|
67
|
+
| `subscriptionFrequencyDays` | Frequency for recurring charges (e.g., 30 for monthly subscriptions). |
|
|
68
|
+
| `country` | Required when using `region`. Valid values defined in `schema.json`. |
|
|
69
|
+
| `region` | Geographic region within a country. Valid values defined in `schema.json`. |
|
|
70
|
+
|
|
71
|
+
## Regions
|
|
72
|
+
|
|
73
|
+
The `region` field allows you to specify the geographic region within a country where a transaction pattern is localized. When using `region`, you must also specify `country` — the JSON schema validates that the region matches the country.
|
|
74
|
+
|
|
75
|
+
Valid countries and their regions are defined in `schema.json` (the `allOf` section with `if/then` rules). The schema is the single source of truth for allowed values.
|
|
76
|
+
|
|
77
|
+
**When to use regions:**
|
|
78
|
+
|
|
79
|
+
- Use `region` only when the transaction clearly belongs to a specific geographic area (e.g., a local restaurant, a regional shop).
|
|
80
|
+
- Do not set `region` for online services, nationwide chains, or when the location is ambiguous.
|
|
81
|
+
- When specifying `region`, you must also set `country` to the matching country name.
|
|
82
|
+
|
|
83
|
+
**Example with region:**
|
|
84
|
+
|
|
85
|
+
```yaml
|
|
86
|
+
- title: "Patong Beach Hotel PHUKET"
|
|
87
|
+
prettyTitle: "Patong Beach Hotel"
|
|
88
|
+
currency: "THB"
|
|
89
|
+
amountRange: {min: 2000, max: 8000}
|
|
90
|
+
amountFormat: 0
|
|
91
|
+
types: ["hotel", "accommodation"]
|
|
92
|
+
country: "Thailand"
|
|
93
|
+
region: "Phuket"
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Template titles
|
|
97
|
+
|
|
98
|
+
```yaml
|
|
99
|
+
title:
|
|
100
|
+
type: template
|
|
101
|
+
template: "Revolut**{num}* DUBLIN"
|
|
102
|
+
prettyTitle: "Revolut"
|
|
103
|
+
params:
|
|
104
|
+
num:
|
|
105
|
+
generator: random_digits
|
|
106
|
+
length: 4
|
|
107
|
+
zero_pad: true
|
|
108
|
+
globalConstant: true
|
|
109
|
+
transform:
|
|
110
|
+
case: upper
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### Generators & parameters
|
|
114
|
+
|
|
115
|
+
| Generator | Required params | Optional params | Notes |
|
|
116
|
+
|------------------|-----------------|--------------------------------|--------------------------------------------------|
|
|
117
|
+
| `random_digits` | `length` | `zero_pad` (default `true`) | digits only; zero_pad keeps leading zeroes |
|
|
118
|
+
| `random_alnum` | `length` | `charset` | mix of letters/digits; `charset` restricts symbols. The default is `abcdefghijklmnopqrstuvwxyz0123456789`. |
|
|
119
|
+
| `choice` | `options` | `weights` (same length) | uniform when weights omitted |
|
|
120
|
+
|
|
121
|
+
Extras:
|
|
122
|
+
|
|
123
|
+
- `globalConstant: true` — reuse the same generated value across the statement (great for IDs).
|
|
124
|
+
- `transform.case`: `upper`, `lower`, or `title`.
|
|
125
|
+
|
|
126
|
+
## Examples
|
|
127
|
+
|
|
128
|
+
### Simple grocery merchant
|
|
129
|
+
|
|
130
|
+
```yaml
|
|
131
|
+
- title: "Tesco Express"
|
|
132
|
+
prettyTitle: "Tesco Express"
|
|
133
|
+
currency: "GBP"
|
|
134
|
+
amountRange: {min: 5.0, max: 50.0}
|
|
135
|
+
amountFormat: 2
|
|
136
|
+
types: ["groceries", "shopping"]
|
|
137
|
+
weight: 120
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
### Subscription service
|
|
141
|
+
|
|
142
|
+
```yaml
|
|
143
|
+
- title: "Netflix.com"
|
|
144
|
+
prettyTitle: "Netflix"
|
|
145
|
+
currency: "EUR"
|
|
146
|
+
amountRange: {min: 13.49, max: 13.49}
|
|
147
|
+
amountFormat: 2
|
|
148
|
+
subscriptionFrequencyDays: 30
|
|
149
|
+
numberOfOccurrences: 10
|
|
150
|
+
types: ["entertainment", "subscription"]
|
|
151
|
+
weight: 300
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### Template with refund metadata
|
|
155
|
+
|
|
156
|
+
```yaml
|
|
157
|
+
- title:
|
|
158
|
+
type: template
|
|
159
|
+
prettyTitle: "Airbnb"
|
|
160
|
+
template: "Airbnb * {code} 662-105-6167"
|
|
161
|
+
params:
|
|
162
|
+
code:
|
|
163
|
+
generator: random_alnum
|
|
164
|
+
length: 12
|
|
165
|
+
charset: "abcdefghijklmnopqrstuvwxyz0123456789"
|
|
166
|
+
transform:
|
|
167
|
+
case: lower
|
|
168
|
+
globalConstant: true
|
|
169
|
+
currency: "USD"
|
|
170
|
+
amountRange: {min: 70.0, max: 900.0}
|
|
171
|
+
amountFormat: 2
|
|
172
|
+
refundProbability: 0.4
|
|
173
|
+
types: ["housing"]
|
|
174
|
+
weight: 700
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### Exact amounts (Fixed price points)
|
|
178
|
+
|
|
179
|
+
```yaml
|
|
180
|
+
- title: "Spotify Premium"
|
|
181
|
+
prettyTitle: "Spotify"
|
|
182
|
+
currency: "EUR"
|
|
183
|
+
amounts: [4.99, 9.99, 14.99]
|
|
184
|
+
amountFormat: 2
|
|
185
|
+
types: ["subscription", "entertainment"]
|
|
186
|
+
weight: 200
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
## Pattern authoring workflow
|
|
190
|
+
|
|
191
|
+
1. Pick the right file (`general.yml`, `eea.yml`, or `<country>.yml`).
|
|
192
|
+
2. Study existing entries (Thailand’s file is a good reference for tone + “uglified” merchant names).
|
|
193
|
+
3. Choose realistic `amountRange`, `amountFormat`, tags, and weights.
|
|
194
|
+
4. Use templates when merchants expose reference numbers.
|
|
195
|
+
5. Annotate generated blocks with comments (e.g., `# Generated transaction pattern - online food`).
|
|
196
|
+
6. Refresh `prettyTitle` after title tweaks by applying the manual derivation rules (strip city/country noise, drop IDs, canonicalize big brands).
|
|
197
|
+
7. Run `pytest` to validate against `schema.json` before committing/publishing.
|
|
198
|
+
|
|
199
|
+
## Tests & release
|
|
200
|
+
|
|
201
|
+
```bash
|
|
202
|
+
pytest # validates YAML + loader invariants
|
|
203
|
+
python -m build # optional local artifact check
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
- CI: `.github/workflows/ci.yml` runs pytest on push/PR.
|
|
207
|
+
- Release automation: merge PRs into `main` with `major release`, `minor release`, or `patch release` labels to control how `.github/workflows/release-tagger.yml` bumps the version after CI finishes green. No label defaults to a build bump (`v1.2.3` → `v1.2.3.1`, etc.). The workflow updates `pyproject.toml` and tags the commit as `v<version>`.
|
|
208
|
+
- PyPI publish: semantic tags (`vMAJOR.MINOR.PATCH` with optional `.<build_or_label>`) trigger `.github/workflows/release.yml`. The release tagger simply creates the tag, so publishing is entirely driven by tag pushes (manual or automated).
|
|
209
|
+
- Need to generate new country patterns for a task? See `AGENTS.md` for the full enrichment workflow.
|
|
210
|
+
|
|
211
|
+
## Pattern preview workflow
|
|
212
|
+
|
|
213
|
+
Pull requests that touch `solyanka/transaction_patterns/data/**` automatically run
|
|
214
|
+
`.github/workflows/pattern-preview.yml`. The workflow uses `python -m solyanka.pattern_preview`
|
|
215
|
+
to diff the branch against the PR base, synthesize up to three example transactions from the
|
|
216
|
+
touched patterns, and post a Markdown table comment (including short/pretty titles) back onto the PR so reviewers can eyeball
|
|
217
|
+
the new merchants. Preview-only fixtures live under `tests/pattern_preview/` and are injected
|
|
218
|
+
via the workflow using the `--extra-patterns` flag so they stay separate from the shipped data.
|
|
219
|
+
If the rendered tables grow beyond GitHub’s comment limit, the workflow automatically splits
|
|
220
|
+
the output into sequential comments while keeping each pattern block intact.
|
|
221
|
+
Run the same command locally to preview the output before pushing changes:
|
|
222
|
+
|
|
223
|
+
```bash
|
|
224
|
+
python -m solyanka.pattern_preview \
|
|
225
|
+
--base-ref origin/main \
|
|
226
|
+
--head-ref HEAD \
|
|
227
|
+
--samples-per-pattern 3 \
|
|
228
|
+
--extra-patterns tests/pattern_preview
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
Conventions: keep YAML human-readable (sorted keys, helpful comments), avoid UUID-looking titles, and update schema/tests whenever the structure changes.
|
|
232
|
+
|
|
233
|
+
## Purpose recap
|
|
234
|
+
|
|
235
|
+
Solyanka is the single source of truth for transaction-pattern assets used by the bank-statement
|
|
236
|
+
generator and any LLM training pipelines. Treat it like a dataset project: tight validation, small
|
|
237
|
+
focused API surface, deterministic releases.
|
solyanka-0.3.0/pyproject.toml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "solyanka"
|
|
3
|
+
version = "0.3.0"
|
|
4
|
+
description = "Transaction pattern utilities and dataset for statement generators"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.11"
|
|
7
|
+
license = { file = "LICENSE" }
|
|
8
|
+
authors = [{ name = "Development Team", email = "dev@company.com" }]
|
|
9
|
+
dependencies = ["pyyaml>=6.0"]
|
|
10
|
+
|
|
11
|
+
[project.optional-dependencies]
|
|
12
|
+
dev = ["pytest>=8.2", "jsonschema>=4.23"]
|
|
13
|
+
|
|
14
|
+
[build-system]
|
|
15
|
+
requires = ["hatchling"]
|
|
16
|
+
build-backend = "hatchling.build"
|
|
17
|
+
|
|
18
|
+
[tool.hatch.build]
|
|
19
|
+
include = ["solyanka", "README.md", "LICENSE"]
|
|
20
|
+
|
|
21
|
+
[tool.hatch.build.targets.wheel]
|
|
22
|
+
packages = ["solyanka"]
|
|
23
|
+
|
|
24
|
+
[tool.pytest.ini_options]
|
|
25
|
+
addopts = "-ra --strict-markers --strict-config"
|
|
26
|
+
testpaths = ["tests"]
|
|
27
|
+
markers = [
|
|
28
|
+
"transaction_patterns: validate transaction pattern YAML files against the schema",
|
|
29
|
+
]
|
solyanka-0.3.0/solyanka/__init__.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Solyanka – lightweight helpers for statement generation tooling."""
|
|
2
|
+
|
|
3
|
+
from importlib import metadata as importlib_metadata
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import tomllib
|
|
7
|
+
|
|
8
|
+
from .transaction_patterns.service import EEA_COUNTRIES, Pattern, PatternsService
|
|
9
|
+
|
|
10
|
+
__all__ = ["EEA_COUNTRIES", "Pattern", "PatternsService"]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _read_local_version() -> str:
|
|
14
|
+
root = Path(__file__).resolve().parent.parent
|
|
15
|
+
pyproject = root / "pyproject.toml"
|
|
16
|
+
if not pyproject.exists():
|
|
17
|
+
return "0.0.0"
|
|
18
|
+
data = tomllib.loads(pyproject.read_text(encoding="utf-8"))
|
|
19
|
+
project = data.get("project", {})
|
|
20
|
+
version = project.get("version")
|
|
21
|
+
if isinstance(version, str):
|
|
22
|
+
return version
|
|
23
|
+
return "0.0.0"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
try:
|
|
27
|
+
__version__ = importlib_metadata.version("solyanka")  # preferred source: metadata of the installed "solyanka" distribution
|
|
28
|
+
except importlib_metadata.PackageNotFoundError: # pragma: no cover
|
|
29
|
+
__version__ = _read_local_version()  # no installed metadata: fall back to the pyproject.toml one directory above the package, else "0.0.0"
|