solyanka 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. solyanka-0.3.0/.gitignore +14 -0
  2. solyanka-0.3.0/LICENSE +21 -0
  3. solyanka-0.3.0/PKG-INFO +271 -0
  4. solyanka-0.3.0/README.md +237 -0
  5. solyanka-0.3.0/pyproject.toml +29 -0
  6. solyanka-0.3.0/solyanka/__init__.py +29 -0
  7. solyanka-0.3.0/solyanka/pattern_preview.py +578 -0
  8. solyanka-0.3.0/solyanka/transaction_patterns/__init__.py +5 -0
  9. solyanka-0.3.0/solyanka/transaction_patterns/data/brazil.yml +1094 -0
  10. solyanka-0.3.0/solyanka/transaction_patterns/data/colombia.yml +26 -0
  11. solyanka-0.3.0/solyanka/transaction_patterns/data/eea.yml +122 -0
  12. solyanka-0.3.0/solyanka/transaction_patterns/data/france.yml +19 -0
  13. solyanka-0.3.0/solyanka/transaction_patterns/data/general.yml +1480 -0
  14. solyanka-0.3.0/solyanka/transaction_patterns/data/georgia.yml +477 -0
  15. solyanka-0.3.0/solyanka/transaction_patterns/data/germany.yml +238 -0
  16. solyanka-0.3.0/solyanka/transaction_patterns/data/hungary.yml +119 -0
  17. solyanka-0.3.0/solyanka/transaction_patterns/data/indonesia.yml +1034 -0
  18. solyanka-0.3.0/solyanka/transaction_patterns/data/japan.yml +140 -0
  19. solyanka-0.3.0/solyanka/transaction_patterns/data/laos.yml +27 -0
  20. solyanka-0.3.0/solyanka/transaction_patterns/data/latvia.yml +181 -0
  21. solyanka-0.3.0/solyanka/transaction_patterns/data/malaysia.yml +1233 -0
  22. solyanka-0.3.0/solyanka/transaction_patterns/data/oae.yml +58 -0
  23. solyanka-0.3.0/solyanka/transaction_patterns/data/oman.yml +28 -0
  24. solyanka-0.3.0/solyanka/transaction_patterns/data/portugal.yml +425 -0
  25. solyanka-0.3.0/solyanka/transaction_patterns/data/schema.json +320 -0
  26. solyanka-0.3.0/solyanka/transaction_patterns/data/spain.yml +322 -0
  27. solyanka-0.3.0/solyanka/transaction_patterns/data/thailand.yml +9128 -0
  28. solyanka-0.3.0/solyanka/transaction_patterns/data/turkey.yml +32 -0
  29. solyanka-0.3.0/solyanka/transaction_patterns/data/uae.yml +412 -0
  30. solyanka-0.3.0/solyanka/transaction_patterns/data/uk.yml +86 -0
  31. solyanka-0.3.0/solyanka/transaction_patterns/data/ukraine.yml +14 -0
  32. solyanka-0.3.0/solyanka/transaction_patterns/data/usa.yml +280 -0
  33. solyanka-0.3.0/solyanka/transaction_patterns/data/vietnam.yml +2819 -0
  34. solyanka-0.3.0/solyanka/transaction_patterns/service.py +286 -0
@@ -0,0 +1,14 @@
1
+ **/__pycache__
2
+ *.pyc
3
+ .pytest_cache/
4
+ .mypy_cache/
5
+ dist/
6
+ build/
7
+ *.egg-info/
8
+ .env
9
+ */coverage/
10
+ logs/*
11
+ !logs/.gitkeep
12
+ **/tmp
13
+ shared
14
+ .worktree/
solyanka-0.3.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) kvokka. All rights reserved.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,271 @@
1
+ Metadata-Version: 2.4
2
+ Name: solyanka
3
+ Version: 0.3.0
4
+ Summary: Transaction pattern utilities and dataset for statement generators
5
+ Author-email: Development Team <dev@company.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) kvokka. All rights reserved.
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+ License-File: LICENSE
28
+ Requires-Python: >=3.11
29
+ Requires-Dist: pyyaml>=6.0
30
+ Provides-Extra: dev
31
+ Requires-Dist: jsonschema>=4.23; extra == 'dev'
32
+ Requires-Dist: pytest>=8.2; extra == 'dev'
33
+ Description-Content-Type: text/markdown
34
+
35
+ # Solyanka
36
+
37
+ Toolkit + dataset for transaction-pattern driven synthetic statements and downstream LLM fine-tuning.
38
+ This package ships the curated YAML files, their schema, and a tiny loader that apps or notebooks can
39
+ use without worrying about file layout.
40
+
41
+ ## Install
42
+
43
+ ```bash
44
+ pip install solyanka # consumers
45
+ pip install -e ".[dev]" # local hacking (pytest + jsonschema for schema validation)
46
+ ```
47
+
48
+ ## Runtime use
49
+
50
+ ```python
51
+ from solyanka import PatternsService
52
+
53
+ svc = PatternsService() # auto-discovers packaged data
54
+ general = svc.load_general_patterns()
55
+ eea = svc.load_eea_patterns()
56
+ thailand = svc.load_country_patterns("Thailand")
57
+
58
+ # Recommended helper: general + (EEA) + country
59
+ bundle = svc.get_country_patterns("Germany")
60
+
61
+ # Fine-grained slices (e.g., validation scripts)
62
+ custom = svc.get_patterns(country="Germany", include="general,eea")
63
+
64
+ # API-ready dictionaries
65
+ payload = svc.get_pattern_dicts(country="Spain")
66
+ ```
67
+
68
+ Override the dataset path (e.g., while editing YAML) via `PatternsService(base_dir=Path("./transaction_patterns"))`
69
+ or the `TRANSACTION_PATTERNS_DIR` environment variable.
70
+
71
+ ## Layout
72
+
73
+ - `solyanka/transaction_patterns/data/*.yml` — curated pattern files (`general.yml`, `eea.yml`, `<country>.yml`).
74
+ - `solyanka/transaction_patterns/data/schema.json` — JSON Schema enforced by tests/CI.
75
+ - `solyanka/transaction_patterns/service.py` — public loader API (keep backward compatible).
76
+ - `tests/` — schema regression + loader behaviour.
77
+
78
+ ## Field spec
79
+
80
+ ### Required fields
81
+
82
+ | Field | Meaning |
83
+ |------------------|---------------------------------------------------------------------------------------------------|
84
+ | `title` | Merchant label. Plain string or template object. |
85
+ | `prettyTitle` | Short merchant label for UI use. Strip cities/countries/punctuation manually (no scripts); follow the documented brand rules (Uber/Amazon/Airbnb/Youtube/Godaddy/Myprotein/Zenni/Bolt/Iherb/Lotus, etc.). |
86
+ | `currency` | Uppercase ISO 4217 (EUR, USD, GBP, ...). |
87
+ | `amountRange` | `{min, max}` floats describing the observed local-currency range. Mutually exclusive with `amounts`.|
88
+ | `amounts` | List of specific float amounts (e.g. `[9.99, 19.99]`). Use this **instead** of `amountRange` for fixed price points. |
89
+ | `amountFormat` | Rounding strategy: `n>0` decimals, `0` whole units, `n<0` powers of ten (e.g., `-2` rounds to 100s).|
90
+ | `types` | Non-empty list of lowercase tags (`shopping`, `restaurant`, `transportation`, ...). |
91
+ | `prettyTitle` | Short merchant label for UI use. Strip cities/countries/punctuation manually (no scripts); follow the documented brand rules (Uber/Amazon/Airbnb/Youtube/Godaddy/Myprotein/Zenni/Bolt/Iherb/Lotus, etc.). |
92
+
93
+ ### Optional fields
94
+
95
+ | Field | Why / how |
96
+ |-----------------------------|-----------------------------------------------------------------------------------------|
97
+ | `weight` | Relative selection probability. 100 = baseline, 120–150 very common, 50 niche. |
98
+ | `refundProbability` | Chance (0–1) that the generator emits a `CARD_REFUND` for this pattern. |
99
+ | `refundDelayMinHours`/`Max` | Boundaries for automatic refund timing (defaults: 72 / 288 hours). |
100
+ | `numberOfOccurrences` | Global cap per statement (useful for rare, one-off merchants). |
101
+ | `subscriptionFrequencyDays` | Frequency for recurring charges (e.g., 30 for monthly subscriptions). |
102
+ | `country` | Required when using `region`. Valid values defined in `schema.json`. |
103
+ | `region` | Geographic region within a country. Valid values defined in `schema.json`. |
104
+
105
+ ## Regions
106
+
107
+ The `region` field allows you to specify the geographic region within a country where a transaction pattern is localized. When using `region`, you must also specify `country` — the JSON schema validates that the region matches the country.
108
+
109
+ Valid countries and their regions are defined in `schema.json` (the `allOf` section with `if/then` rules). The schema is the single source of truth for allowed values.
110
+
111
+ **When to use regions:**
112
+
113
+ - Use `region` only when the transaction clearly belongs to a specific geographic area (e.g., a local restaurant, a regional shop).
114
+ - Do not set `region` for online services, nationwide chains, or when the location is ambiguous.
115
+ - When specifying `region`, you must also set `country` to the matching country name.
116
+
117
+ **Example with region:**
118
+
119
+ ```yaml
120
+ - title: "Patong Beach Hotel PHUKET"
121
+ prettyTitle: "Patong Beach Hotel"
122
+ currency: "THB"
123
+ amountRange: {min: 2000, max: 8000}
124
+ amountFormat: 0
125
+ types: ["hotel", "accommodation"]
126
+ country: "Thailand"
127
+ region: "Phuket"
128
+ ```
129
+
130
+ ## Template titles
131
+
132
+ ```yaml
133
+ title:
134
+ type: template
135
+ template: "Revolut**{num}* DUBLIN"
136
+ prettyTitle: "Revolut"
137
+ params:
138
+ num:
139
+ generator: random_digits
140
+ length: 4
141
+ zero_pad: true
142
+ globalConstant: true
143
+ transform:
144
+ case: upper
145
+ ```
146
+
147
+ ### Generators & parameters
148
+
149
+ | Generator | Required params | Optional params | Notes |
150
+ |------------------|-----------------|--------------------------------|--------------------------------------------------|
151
+ | `random_digits` | `length` | `zero_pad` (default `true`) | digits only; zero_pad keeps leading zeroes |
152
+ | `random_alnum` | `length` | `charset` | mix of letters/digits; `charset` restricts symbols. The default is `abcdefghijklmnopqrstuvwxyz0123456789`. |
153
+ | `choice` | `options` | `weights` (same length) | uniform when weights omitted |
154
+
155
+ Extras:
156
+
157
+ - `globalConstant: true` — reuse the same generated value across the statement (great for IDs).
158
+ - `transform.case`: `upper`, `lower`, or `title`.
159
+
160
+ ## Examples
161
+
162
+ ### Simple grocery merchant
163
+
164
+ ```yaml
165
+ - title: "Tesco Express"
166
+ prettyTitle: "Tesco Express"
167
+ currency: "GBP"
168
+ amountRange: {min: 5.0, max: 50.0}
169
+ amountFormat: 2
170
+ types: ["groceries", "shopping"]
171
+ weight: 120
172
+ ```
173
+
174
+ ### Subscription service
175
+
176
+ ```yaml
177
+ - title: "Netflix.com"
178
+ prettyTitle: "Netflix"
179
+ currency: "EUR"
180
+ amountRange: {min: 13.49, max: 13.49}
181
+ amountFormat: 2
182
+ subscriptionFrequencyDays: 30
183
+ numberOfOccurrences: 10
184
+ types: ["entertainment", "subscription"]
185
+ weight: 300
186
+ ```
187
+
188
+ ### Template with refund metadata
189
+
190
+ ```yaml
191
+ - title:
192
+ type: template
193
+ prettyTitle: "Airbnb"
194
+ template: "Airbnb * {code} 662-105-6167"
195
+ params:
196
+ code:
197
+ generator: random_alnum
198
+ length: 12
199
+ charset: "abcdefghijklmnopqrstuvwxyz0123456789"
200
+ transform:
201
+ case: lower
202
+ globalConstant: true
203
+ currency: "USD"
204
+ amountRange: {min: 70.0, max: 900.0}
205
+ amountFormat: 2
206
+ refundProbability: 0.4
207
+ types: ["housing"]
208
+ weight: 700
209
+ ```
210
+
211
+ ### Exact amounts (Fixed price points)
212
+
213
+ ```yaml
214
+ - title: "Spotify Premium"
215
+ prettyTitle: "Spotify"
216
+ currency: "EUR"
217
+ amounts: [4.99, 9.99, 14.99]
218
+ amountFormat: 2
219
+ types: ["subscription", "entertainment"]
220
+ weight: 200
221
+ ```
222
+
223
+ ## Pattern authoring workflow
224
+
225
+ 1. Pick the right file (`general.yml`, `eea.yml`, or `<country>.yml`).
226
+ 2. Study existing entries (Thailand’s file is a good reference for tone + “uglified” merchant names).
227
+ 3. Choose realistic `amountRange`, `amountFormat`, tags, and weights.
228
+ 4. Use templates when merchants expose reference numbers.
229
+ 5. Annotate generated blocks with comments (e.g., `# Generated transaction pattern - online food`).
230
+ 6. Refresh `prettyTitle` after title tweaks by applying the manual derivation rules (strip city/country noise, drop IDs, canonicalize big brands).
231
+ 7. Run `pytest` to validate against `schema.json` before committing/publishing.
232
+
233
+ ## Tests & release
234
+
235
+ ```bash
236
+ pytest # validates YAML + loader invariants
237
+ python -m build # optional local artifact check
238
+ ```
239
+
240
+ - CI: `.github/workflows/ci.yml` runs pytest on push/PR.
241
+ - Release automation: merge PRs into `main` with `major release`, `minor release`, or `patch release` labels to control how `.github/workflows/release-tagger.yml` bumps the version after CI finishes green. No label defaults to a build bump (`v1.2.3` → `v1.2.3.1`, etc.). The workflow updates `pyproject.toml` and tags the commit as `v<version>`.
242
+ - PyPI publish: semantic tags (`vMAJOR.MINOR.PATCH` with optional `.<build_or_label>`) trigger `.github/workflows/release.yml`. The release tagger simply creates the tag, so publishing is entirely driven by tag pushes (manual or automated).
243
+ - Need to generate new country patterns for a task? See `AGENTS.md` for the full enrichment workflow.
244
+
245
+ ## Pattern preview workflow
246
+
247
+ Pull requests that touch `solyanka/transaction_patterns/data/**` automatically run
248
+ `.github/workflows/pattern-preview.yml`. The workflow uses `python -m solyanka.pattern_preview`
249
+ to diff the branch against the PR base, synthesize up to three example transactions from the
250
+ touched patterns, and post a Markdown table comment (including short/pretty titles) back onto the PR so reviewers can eyeball
251
+ the new merchants. Preview-only fixtures live under `tests/pattern_preview/` and are injected
252
+ via the workflow using the `--extra-patterns` flag so they stay separate from the shipped data.
253
+ If the rendered tables grow beyond GitHub’s comment limit, the workflow automatically splits
254
+ the output into sequential comments while keeping each pattern block intact.
255
+ Run the same command locally to preview the output before pushing changes:
256
+
257
+ ```bash
258
+ python -m solyanka.pattern_preview \
259
+ --base-ref origin/main \
260
+ --head-ref HEAD \
261
+ --samples-per-pattern 3 \
262
+ --extra-patterns tests/pattern_preview
263
+ ```
264
+
265
+ Conventions: keep YAML human-readable (sorted keys, helpful comments), avoid UUID-looking titles, and update schema/tests whenever the structure changes.
266
+
267
+ ## Purpose recap
268
+
269
+ Solyanka is the single source of truth for transaction-pattern assets used by the bank-statement
270
+ generator and any LLM training pipelines. Treat it like a dataset project: tight validation, small
271
+ focused API surface, deterministic releases.
@@ -0,0 +1,237 @@
1
+ # Solyanka
2
+
3
+ Toolkit + dataset for transaction-pattern driven synthetic statements and downstream LLM fine-tuning.
4
+ This package ships the curated YAML files, their schema, and a tiny loader that apps or notebooks can
5
+ use without worrying about file layout.
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pip install solyanka # consumers
11
+ pip install -e ".[dev]" # local hacking (pytest + jsonschema for schema validation)
12
+ ```
13
+
14
+ ## Runtime use
15
+
16
+ ```python
17
+ from solyanka import PatternsService
18
+
19
+ svc = PatternsService() # auto-discovers packaged data
20
+ general = svc.load_general_patterns()
21
+ eea = svc.load_eea_patterns()
22
+ thailand = svc.load_country_patterns("Thailand")
23
+
24
+ # Recommended helper: general + (EEA) + country
25
+ bundle = svc.get_country_patterns("Germany")
26
+
27
+ # Fine-grained slices (e.g., validation scripts)
28
+ custom = svc.get_patterns(country="Germany", include="general,eea")
29
+
30
+ # API-ready dictionaries
31
+ payload = svc.get_pattern_dicts(country="Spain")
32
+ ```
33
+
34
+ Override the dataset path (e.g., while editing YAML) via `PatternsService(base_dir=Path("./transaction_patterns"))`
35
+ or the `TRANSACTION_PATTERNS_DIR` environment variable.
36
+
37
+ ## Layout
38
+
39
+ - `solyanka/transaction_patterns/data/*.yml` — curated pattern files (`general.yml`, `eea.yml`, `<country>.yml`).
40
+ - `solyanka/transaction_patterns/data/schema.json` — JSON Schema enforced by tests/CI.
41
+ - `solyanka/transaction_patterns/service.py` — public loader API (keep backward compatible).
42
+ - `tests/` — schema regression + loader behaviour.
43
+
44
+ ## Field spec
45
+
46
+ ### Required fields
47
+
48
+ | Field | Meaning |
49
+ |------------------|---------------------------------------------------------------------------------------------------|
50
+ | `title` | Merchant label. Plain string or template object. |
51
+ | `prettyTitle` | Short merchant label for UI use. Strip cities/countries/punctuation manually (no scripts); follow the documented brand rules (Uber/Amazon/Airbnb/Youtube/Godaddy/Myprotein/Zenni/Bolt/Iherb/Lotus, etc.). |
52
+ | `currency` | Uppercase ISO 4217 (EUR, USD, GBP, ...). |
53
+ | `amountRange` | `{min, max}` floats describing the observed local-currency range. Mutually exclusive with `amounts`.|
54
+ | `amounts` | List of specific float amounts (e.g. `[9.99, 19.99]`). Use this **instead** of `amountRange` for fixed price points. |
55
+ | `amountFormat` | Rounding strategy: `n>0` decimals, `0` whole units, `n<0` powers of ten (e.g., `-2` rounds to 100s).|
56
+ | `types` | Non-empty list of lowercase tags (`shopping`, `restaurant`, `transportation`, ...). |
57
+ | `prettyTitle` | Short merchant label for UI use. Strip cities/countries/punctuation manually (no scripts); follow the documented brand rules (Uber/Amazon/Airbnb/Youtube/Godaddy/Myprotein/Zenni/Bolt/Iherb/Lotus, etc.). |
58
+
59
+ ### Optional fields
60
+
61
+ | Field | Why / how |
62
+ |-----------------------------|-----------------------------------------------------------------------------------------|
63
+ | `weight` | Relative selection probability. 100 = baseline, 120–150 very common, 50 niche. |
64
+ | `refundProbability` | Chance (0–1) that the generator emits a `CARD_REFUND` for this pattern. |
65
+ | `refundDelayMinHours`/`Max` | Boundaries for automatic refund timing (defaults: 72 / 288 hours). |
66
+ | `numberOfOccurrences` | Global cap per statement (useful for rare, one-off merchants). |
67
+ | `subscriptionFrequencyDays` | Frequency for recurring charges (e.g., 30 for monthly subscriptions). |
68
+ | `country` | Required when using `region`. Valid values defined in `schema.json`. |
69
+ | `region` | Geographic region within a country. Valid values defined in `schema.json`. |
70
+
71
+ ## Regions
72
+
73
+ The `region` field allows you to specify the geographic region within a country where a transaction pattern is localized. When using `region`, you must also specify `country` — the JSON schema validates that the region matches the country.
74
+
75
+ Valid countries and their regions are defined in `schema.json` (the `allOf` section with `if/then` rules). The schema is the single source of truth for allowed values.
76
+
77
+ **When to use regions:**
78
+
79
+ - Use `region` only when the transaction clearly belongs to a specific geographic area (e.g., a local restaurant, a regional shop).
80
+ - Do not set `region` for online services, nationwide chains, or when the location is ambiguous.
81
+ - When specifying `region`, you must also set `country` to the matching country name.
82
+
83
+ **Example with region:**
84
+
85
+ ```yaml
86
+ - title: "Patong Beach Hotel PHUKET"
87
+ prettyTitle: "Patong Beach Hotel"
88
+ currency: "THB"
89
+ amountRange: {min: 2000, max: 8000}
90
+ amountFormat: 0
91
+ types: ["hotel", "accommodation"]
92
+ country: "Thailand"
93
+ region: "Phuket"
94
+ ```
95
+
96
+ ## Template titles
97
+
98
+ ```yaml
99
+ title:
100
+ type: template
101
+ template: "Revolut**{num}* DUBLIN"
102
+ prettyTitle: "Revolut"
103
+ params:
104
+ num:
105
+ generator: random_digits
106
+ length: 4
107
+ zero_pad: true
108
+ globalConstant: true
109
+ transform:
110
+ case: upper
111
+ ```
112
+
113
+ ### Generators & parameters
114
+
115
+ | Generator | Required params | Optional params | Notes |
116
+ |------------------|-----------------|--------------------------------|--------------------------------------------------|
117
+ | `random_digits` | `length` | `zero_pad` (default `true`) | digits only; zero_pad keeps leading zeroes |
118
+ | `random_alnum` | `length` | `charset` | mix of letters/digits; `charset` restricts symbols. The default is `abcdefghijklmnopqrstuvwxyz0123456789`. |
119
+ | `choice` | `options` | `weights` (same length) | uniform when weights omitted |
120
+
121
+ Extras:
122
+
123
+ - `globalConstant: true` — reuse the same generated value across the statement (great for IDs).
124
+ - `transform.case`: `upper`, `lower`, or `title`.
125
+
126
+ ## Examples
127
+
128
+ ### Simple grocery merchant
129
+
130
+ ```yaml
131
+ - title: "Tesco Express"
132
+ prettyTitle: "Tesco Express"
133
+ currency: "GBP"
134
+ amountRange: {min: 5.0, max: 50.0}
135
+ amountFormat: 2
136
+ types: ["groceries", "shopping"]
137
+ weight: 120
138
+ ```
139
+
140
+ ### Subscription service
141
+
142
+ ```yaml
143
+ - title: "Netflix.com"
144
+ prettyTitle: "Netflix"
145
+ currency: "EUR"
146
+ amountRange: {min: 13.49, max: 13.49}
147
+ amountFormat: 2
148
+ subscriptionFrequencyDays: 30
149
+ numberOfOccurrences: 10
150
+ types: ["entertainment", "subscription"]
151
+ weight: 300
152
+ ```
153
+
154
+ ### Template with refund metadata
155
+
156
+ ```yaml
157
+ - title:
158
+ type: template
159
+ prettyTitle: "Airbnb"
160
+ template: "Airbnb * {code} 662-105-6167"
161
+ params:
162
+ code:
163
+ generator: random_alnum
164
+ length: 12
165
+ charset: "abcdefghijklmnopqrstuvwxyz0123456789"
166
+ transform:
167
+ case: lower
168
+ globalConstant: true
169
+ currency: "USD"
170
+ amountRange: {min: 70.0, max: 900.0}
171
+ amountFormat: 2
172
+ refundProbability: 0.4
173
+ types: ["housing"]
174
+ weight: 700
175
+ ```
176
+
177
+ ### Exact amounts (Fixed price points)
178
+
179
+ ```yaml
180
+ - title: "Spotify Premium"
181
+ prettyTitle: "Spotify"
182
+ currency: "EUR"
183
+ amounts: [4.99, 9.99, 14.99]
184
+ amountFormat: 2
185
+ types: ["subscription", "entertainment"]
186
+ weight: 200
187
+ ```
188
+
189
+ ## Pattern authoring workflow
190
+
191
+ 1. Pick the right file (`general.yml`, `eea.yml`, or `<country>.yml`).
192
+ 2. Study existing entries (Thailand’s file is a good reference for tone + “uglified” merchant names).
193
+ 3. Choose realistic `amountRange`, `amountFormat`, tags, and weights.
194
+ 4. Use templates when merchants expose reference numbers.
195
+ 5. Annotate generated blocks with comments (e.g., `# Generated transaction pattern - online food`).
196
+ 6. Refresh `prettyTitle` after title tweaks by applying the manual derivation rules (strip city/country noise, drop IDs, canonicalize big brands).
197
+ 7. Run `pytest` to validate against `schema.json` before committing/publishing.
198
+
199
+ ## Tests & release
200
+
201
+ ```bash
202
+ pytest # validates YAML + loader invariants
203
+ python -m build # optional local artifact check
204
+ ```
205
+
206
+ - CI: `.github/workflows/ci.yml` runs pytest on push/PR.
207
+ - Release automation: merge PRs into `main` with `major release`, `minor release`, or `patch release` labels to control how `.github/workflows/release-tagger.yml` bumps the version after CI finishes green. No label defaults to a build bump (`v1.2.3` → `v1.2.3.1`, etc.). The workflow updates `pyproject.toml` and tags the commit as `v<version>`.
208
+ - PyPI publish: semantic tags (`vMAJOR.MINOR.PATCH` with optional `.<build_or_label>`) trigger `.github/workflows/release.yml`. The release tagger simply creates the tag, so publishing is entirely driven by tag pushes (manual or automated).
209
+ - Need to generate new country patterns for a task? See `AGENTS.md` for the full enrichment workflow.
210
+
211
+ ## Pattern preview workflow
212
+
213
+ Pull requests that touch `solyanka/transaction_patterns/data/**` automatically run
214
+ `.github/workflows/pattern-preview.yml`. The workflow uses `python -m solyanka.pattern_preview`
215
+ to diff the branch against the PR base, synthesize up to three example transactions from the
216
+ touched patterns, and post a Markdown table comment (including short/pretty titles) back onto the PR so reviewers can eyeball
217
+ the new merchants. Preview-only fixtures live under `tests/pattern_preview/` and are injected
218
+ via the workflow using the `--extra-patterns` flag so they stay separate from the shipped data.
219
+ If the rendered tables grow beyond GitHub’s comment limit, the workflow automatically splits
220
+ the output into sequential comments while keeping each pattern block intact.
221
+ Run the same command locally to preview the output before pushing changes:
222
+
223
+ ```bash
224
+ python -m solyanka.pattern_preview \
225
+ --base-ref origin/main \
226
+ --head-ref HEAD \
227
+ --samples-per-pattern 3 \
228
+ --extra-patterns tests/pattern_preview
229
+ ```
230
+
231
+ Conventions: keep YAML human-readable (sorted keys, helpful comments), avoid UUID-looking titles, and update schema/tests whenever the structure changes.
232
+
233
+ ## Purpose recap
234
+
235
+ Solyanka is the single source of truth for transaction-pattern assets used by the bank-statement
236
+ generator and any LLM training pipelines. Treat it like a dataset project: tight validation, small
237
+ focused API surface, deterministic releases.
@@ -0,0 +1,29 @@
1
+ [project]
2
+ name = "solyanka"
3
+ version = "0.3.0"
4
+ description = "Transaction pattern utilities and dataset for statement generators"
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ license = { file = "LICENSE" }
8
+ authors = [{ name = "Development Team", email = "dev@company.com" }]
9
+ dependencies = ["pyyaml>=6.0"]
10
+
11
+ [project.optional-dependencies]
12
+ dev = ["pytest>=8.2", "jsonschema>=4.23"]
13
+
14
+ [build-system]
15
+ requires = ["hatchling"]
16
+ build-backend = "hatchling.build"
17
+
18
+ [tool.hatch.build]
19
+ include = ["solyanka", "README.md", "LICENSE"]
20
+
21
+ [tool.hatch.build.targets.wheel]
22
+ packages = ["solyanka"]
23
+
24
+ [tool.pytest.ini_options]
25
+ addopts = "-ra --strict-markers --strict-config"
26
+ testpaths = ["tests"]
27
+ markers = [
28
+ "transaction_patterns: validate transaction pattern YAML files against the schema",
29
+ ]
@@ -0,0 +1,29 @@
1
+ """Solyanka – lightweight helpers for statement generation tooling."""
2
+
3
+ from importlib import metadata as importlib_metadata
4
+ from pathlib import Path
5
+
6
+ import tomllib
7
+
8
+ from .transaction_patterns.service import EEA_COUNTRIES, Pattern, PatternsService
9
+
10
+ __all__ = ["EEA_COUNTRIES", "Pattern", "PatternsService"]
11
+
12
+
13
+ def _read_local_version() -> str:
14
+ root = Path(__file__).resolve().parent.parent
15
+ pyproject = root / "pyproject.toml"
16
+ if not pyproject.exists():
17
+ return "0.0.0"
18
+ data = tomllib.loads(pyproject.read_text(encoding="utf-8"))
19
+ project = data.get("project", {})
20
+ version = project.get("version")
21
+ if isinstance(version, str):
22
+ return version
23
+ return "0.0.0"
24
+
25
+
26
# Resolve the package version at import time: ask importlib.metadata for the
# installed "solyanka" distribution first, and fall back to
# _read_local_version() when no such distribution is found.
+ try:
27
+ __version__ = importlib_metadata.version("solyanka")
28
+ except importlib_metadata.PackageNotFoundError: # pragma: no cover
29
+ __version__ = _read_local_version()