opentaxonomy 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. opentaxonomy-0.1.0/.gitignore +34 -0
  2. opentaxonomy-0.1.0/PKG-INFO +130 -0
  3. opentaxonomy-0.1.0/README.md +111 -0
  4. opentaxonomy-0.1.0/files/README.md +66 -0
  5. opentaxonomy-0.1.0/files/digital_subscriptions.yaml +31 -0
  6. opentaxonomy-0.1.0/files/dining_out.yaml +28 -0
  7. opentaxonomy-0.1.0/files/discretionary.yaml +30 -0
  8. opentaxonomy-0.1.0/files/expenditure.yaml +27 -0
  9. opentaxonomy-0.1.0/files/groceries.yaml +22 -0
  10. opentaxonomy-0.1.0/files/placement_map.yaml +307 -0
  11. opentaxonomy-0.1.0/files/prima-seed.yaml +167 -0
  12. opentaxonomy-0.1.0/files/seed.yaml +102 -0
  13. opentaxonomy-0.1.0/pyproject.toml +33 -0
  14. opentaxonomy-0.1.0/src/opentaxonomy/__init__.py +1 -0
  15. opentaxonomy-0.1.0/src/opentaxonomy/cli.py +105 -0
  16. opentaxonomy-0.1.0/src/opentaxonomy/commands/__init__.py +0 -0
  17. opentaxonomy-0.1.0/src/opentaxonomy/commands/create.py +97 -0
  18. opentaxonomy-0.1.0/src/opentaxonomy/commands/run.py +82 -0
  19. opentaxonomy-0.1.0/src/opentaxonomy/io/__init__.py +0 -0
  20. opentaxonomy-0.1.0/src/opentaxonomy/io/base.py +15 -0
  21. opentaxonomy-0.1.0/src/opentaxonomy/io/db_sources.py +40 -0
  22. opentaxonomy-0.1.0/src/opentaxonomy/io/file_sources.py +73 -0
  23. opentaxonomy-0.1.0/src/opentaxonomy/llm/__init__.py +0 -0
  24. opentaxonomy-0.1.0/src/opentaxonomy/llm/client.py +62 -0
  25. opentaxonomy-0.1.0/src/opentaxonomy/llm/create_flow.py +389 -0
  26. opentaxonomy-0.1.0/src/opentaxonomy/llm/prompts.py +180 -0
  27. opentaxonomy-0.1.0/src/opentaxonomy/llm/run_flow.py +190 -0
  28. opentaxonomy-0.1.0/src/opentaxonomy/llm/schemas.py +91 -0
  29. opentaxonomy-0.1.0/src/opentaxonomy/utils/__init__.py +0 -0
  30. opentaxonomy-0.1.0/src/opentaxonomy/utils/canonical_id.py +15 -0
@@ -0,0 +1,34 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ *.egg
6
+ dist/
7
+ build/
8
+ .eggs/
9
+
10
+ # Virtual environments
11
+ .venv/
12
+ venv/
13
+ env/
14
+
15
+ # Distribution
16
+ dist/
17
+
18
+ # Environment
19
+ .env
20
+ .env.*
21
+
22
+ # IDE
23
+ .vscode/
24
+ .idea/
25
+
26
+ # Taxonomy output (generated data — not source)
27
+ taxonomy/
28
+
29
+ # Claude Code
30
+ .claude/
31
+
32
+ # OS
33
+ .DS_Store
34
+ Thumbs.db
@@ -0,0 +1,130 @@
1
+ Metadata-Version: 2.4
2
+ Name: opentaxonomy
3
+ Version: 0.1.0
4
+ Summary: LLM-powered semantic taxonomy generator for raw categorical data
5
+ License: MIT
6
+ Requires-Python: >=3.11
7
+ Requires-Dist: anthropic>=0.40.0
8
+ Requires-Dist: click>=8.1.0
9
+ Requires-Dist: openpyxl>=3.1.0
10
+ Requires-Dist: pandas>=2.0.0
11
+ Requires-Dist: pyarrow>=14.0.0
12
+ Requires-Dist: pydantic>=2.0.0
13
+ Requires-Dist: pyyaml>=6.0
14
+ Requires-Dist: rich>=13.0.0
15
+ Requires-Dist: sqlalchemy>=2.0.0
16
+ Provides-Extra: dev
17
+ Requires-Dist: pytest>=7.0; extra == 'dev'
18
+ Description-Content-Type: text/markdown
19
+
20
+ # OpenTaxonomy
21
+
22
+ LLM-powered semantic taxonomy generator for raw categorical data.
23
+
24
+ ## What it does
25
+
26
+ OpenTaxonomy takes a column of messy raw values (bank transactions, product names, survey responses — anything) and generates a structured semantic taxonomy from it using Claude as the reasoning engine.
27
+
28
+ **Output for each value:**
29
+ - `{column}_normalized` — cleaned entity name (e.g. `"REWE SAGT DANKE 46654184/..."` → `"REWE"`)
30
+ - `canonical_id` — taxonomy path (e.g. `ft.expenditure.variable.groceries`)
31
+
32
+ The taxonomy itself is a set of YAML files — a `seed.yaml` capturing the domain structure, one YAML file per tree node under `nodes/` (each an ontological contract with inclusion/exclusion criteria and, where a splitting decision was made, a decision record), and a `placement_map.yaml` that is the source of truth for all mappings.
33
+
34
+ ## Architecture
35
+
36
+ The core is the **Prima Seed** — a universal questioning protocol that generates domain-specific taxonomic trees from any categorical data:
37
+
38
+ - **Q0** Identify the Form: what unifies all values?
39
+ - **Q0b** Establish context: what operational realm governs classification?
40
+ - **Q1** Primary differentiation: the most essential splitting criterion
41
+ - **Q2** Recursive differentiation: applied per branch at each level
42
+ - **Q3** Dialectical check: values that resist placement expose flawed criteria
43
+
44
+ ## Installation
45
+
46
+ ```bash
47
+ pip install opentaxonomy
48
+ ```
49
+
50
+ Requires Python 3.11+ and an [Anthropic API key](https://console.anthropic.com/).
51
+
52
+ ## Usage
53
+
54
+ ```bash
55
+ export ANTHROPIC_API_KEY=sk-ant-...
56
+
57
+ # Generate a new taxonomy from raw data
58
+ opentaxonomy create -i transactions.csv -c description -o ./taxonomy
59
+
60
+ # Place new/unseen values into an existing taxonomy
61
+ opentaxonomy run -i new_data.csv -c description -s ./taxonomy
62
+ ```
63
+
64
+ ### Supported input formats
65
+
66
+ | Format | Example |
67
+ |--------|---------|
68
+ | CSV / TSV | `data.csv`, `data.tsv` |
69
+ | JSON | `data.json` |
70
+ | Excel | `data.xlsx` |
71
+ | Parquet | `data.parquet` |
72
+ | Database | `postgresql://user:pass@host/db` + `--db-table` |
73
+
74
+ ### Options
75
+
76
+ ```
77
+ opentaxonomy create
78
+ -i, --input Input file or database connection string [required]
79
+ -c, --column Column containing raw values to classify [required]
80
+ -o, --output-dir Where to write taxonomy files [default: ./taxonomy]
81
+ --domain-hint Optional hint to guide the LLM (e.g. "German grocery products")
82
+ --model Claude model [default: claude-sonnet-4-6]
83
+ --api-key Anthropic API key (or set ANTHROPIC_API_KEY)
84
+
85
+ opentaxonomy run
86
+ -i, --input Input file or database connection string [required]
87
+ -c, --column Column containing raw values to classify [required]
88
+ -s, --seed-dir Directory with seed.yaml and placement_map.yaml [default: ./taxonomy]
89
+ --model Claude model [default: claude-sonnet-4-6]
90
+ --api-key Anthropic API key (or set ANTHROPIC_API_KEY)
91
+ ```
92
+
93
+ ## Output structure
94
+
95
+ ```
96
+ taxonomy/
97
+ ├── seed.yaml # Domain seed: context, levels, edge cases
98
+ ├── placement_map.yaml # Raw values → canonical IDs (source of truth)
99
+ └── nodes/
100
+ ├── root.yaml # Root node
101
+ ├── expenditure.yaml # Internal node with decision record
102
+ ├── groceries.yaml # Leaf node with inclusion/exclusion criteria
103
+ └── ...
104
+ ```
105
+
106
+ Each node file is an ontological contract:
107
+
108
+ ```yaml
109
+ node: Groceries
110
+ canonical_id: ft.expenditure.variable.groceries
111
+ question: Is this a purchase of food or household staples from a supermarket or grocery store?
112
+ criteria:
113
+ includes:
114
+ - Supermarket purchases (REWE, LIDL, EDEKA, etc.)
115
+ - Organic/bio market purchases
116
+ excludes:
117
+ - Restaurant meals
118
+ - Drugstore purchases unless food items
119
+ edge_cases:
120
+ - term: REWE TO GO
121
+ resolution: Included — still a grocery/convenience purchase
122
+ decided: true
123
+ parent: Variable Necessities
124
+ children: []
125
+ version: 1.0.0
126
+ ```
127
+
128
+ ## License
129
+
130
+ MIT
@@ -0,0 +1,111 @@
1
+ # OpenTaxonomy
2
+
3
+ LLM-powered semantic taxonomy generator for raw categorical data.
4
+
5
+ ## What it does
6
+
7
+ OpenTaxonomy takes a column of messy raw values (bank transactions, product names, survey responses — anything) and generates a structured semantic taxonomy from it using Claude as the reasoning engine.
8
+
9
+ **Output for each value:**
10
+ - `{column}_normalized` — cleaned entity name (e.g. `"REWE SAGT DANKE 46654184/..."` → `"REWE"`)
11
+ - `canonical_id` — taxonomy path (e.g. `ft.expenditure.variable.groceries`)
12
+
13
+ The taxonomy itself is a set of YAML files — a `seed.yaml` capturing the domain structure, one YAML file per tree node under `nodes/` (each an ontological contract with inclusion/exclusion criteria and, where a splitting decision was made, a decision record), and a `placement_map.yaml` that is the source of truth for all mappings.
14
+
15
+ ## Architecture
16
+
17
+ The core is the **Prima Seed** — a universal questioning protocol that generates domain-specific taxonomic trees from any categorical data:
18
+
19
+ - **Q0** Identify the Form: what unifies all values?
20
+ - **Q0b** Establish context: what operational realm governs classification?
21
+ - **Q1** Primary differentiation: the most essential splitting criterion
22
+ - **Q2** Recursive differentiation: applied per branch at each level
23
+ - **Q3** Dialectical check: values that resist placement expose flawed criteria
24
+
25
+ ## Installation
26
+
27
+ ```bash
28
+ pip install opentaxonomy
29
+ ```
30
+
31
+ Requires Python 3.11+ and an [Anthropic API key](https://console.anthropic.com/).
32
+
33
+ ## Usage
34
+
35
+ ```bash
36
+ export ANTHROPIC_API_KEY=sk-ant-...
37
+
38
+ # Generate a new taxonomy from raw data
39
+ opentaxonomy create -i transactions.csv -c description -o ./taxonomy
40
+
41
+ # Place new/unseen values into an existing taxonomy
42
+ opentaxonomy run -i new_data.csv -c description -s ./taxonomy
43
+ ```
44
+
45
+ ### Supported input formats
46
+
47
+ | Format | Example |
48
+ |--------|---------|
49
+ | CSV / TSV | `data.csv`, `data.tsv` |
50
+ | JSON | `data.json` |
51
+ | Excel | `data.xlsx` |
52
+ | Parquet | `data.parquet` |
53
+ | Database | `postgresql://user:pass@host/db` + `--db-table` |
54
+
55
+ ### Options
56
+
57
+ ```
58
+ opentaxonomy create
59
+ -i, --input Input file or database connection string [required]
60
+ -c, --column Column containing raw values to classify [required]
61
+ -o, --output-dir Where to write taxonomy files [default: ./taxonomy]
62
+ --domain-hint Optional hint to guide the LLM (e.g. "German grocery products")
63
+ --model Claude model [default: claude-sonnet-4-6]
64
+ --api-key Anthropic API key (or set ANTHROPIC_API_KEY)
65
+
66
+ opentaxonomy run
67
+ -i, --input Input file or database connection string [required]
68
+ -c, --column Column containing raw values to classify [required]
69
+ -s, --seed-dir Directory with seed.yaml and placement_map.yaml [default: ./taxonomy]
70
+ --model Claude model [default: claude-sonnet-4-6]
71
+ --api-key Anthropic API key (or set ANTHROPIC_API_KEY)
72
+ ```
73
+
74
+ ## Output structure
75
+
76
+ ```
77
+ taxonomy/
78
+ ├── seed.yaml # Domain seed: context, levels, edge cases
79
+ ├── placement_map.yaml # Raw values → canonical IDs (source of truth)
80
+ └── nodes/
81
+ ├── root.yaml # Root node
82
+ ├── expenditure.yaml # Internal node with decision record
83
+ ├── groceries.yaml # Leaf node with inclusion/exclusion criteria
84
+ └── ...
85
+ ```
86
+
87
+ Each node file is an ontological contract:
88
+
89
+ ```yaml
90
+ node: Groceries
91
+ canonical_id: ft.expenditure.variable.groceries
92
+ question: Is this a purchase of food or household staples from a supermarket or grocery store?
93
+ criteria:
94
+ includes:
95
+ - Supermarket purchases (REWE, LIDL, EDEKA, etc.)
96
+ - Organic/bio market purchases
97
+ excludes:
98
+ - Restaurant meals
99
+ - Drugstore purchases unless food items
100
+ edge_cases:
101
+ - term: REWE TO GO
102
+ resolution: Included — still a grocery/convenience purchase
103
+ decided: true
104
+ parent: Variable Necessities
105
+ children: []
106
+ version: 1.0.0
107
+ ```
108
+
109
+ ## License
110
+
111
+ MIT
@@ -0,0 +1,66 @@
1
+ # OpenTaxonomy v0.1.0 — First Output
2
+
3
+ ## What is this?
4
+
5
+ This is the first concrete output of the OpenTaxonomy project — a proof-of-concept
6
+ demonstrating the Prima Seed protocol applied to real-world data.
7
+
8
+ ## Structure
9
+
10
+ ```
11
+ opentaxonomy/
12
+ ├── README.md ← You are here
13
+ └── seeds/
14
+ ├── prima/
15
+ │ └── prima-seed.yaml ← The universal questioning protocol
16
+ └── personal-finance-transactions/
17
+ ├── seed.yaml ← Domain seed (generated by Prima Seed)
18
+ ├── placement_map.yaml ← Raw values → canonical IDs
19
+ └── nodes/
20
+ ├── root.yaml ← Financial Transaction (root)
21
+ ├── income.yaml ← Income
22
+ ├── employment.yaml ← Employment Income (leaf)
23
+ ├── government_benefits.yaml ← Government Benefits (leaf)
24
+ ├── expenditure.yaml ← Expenditure
25
+ ├── fixed_obligations.yaml ← Fixed Obligations
26
+ ├── variable_necessities.yaml ← Variable Necessities
27
+ ├── discretionary.yaml ← Discretionary
28
+ ├── groceries.yaml ← Groceries (leaf)
29
+ ├── dining_out.yaml ← Dining Out (leaf)
30
+ └── digital_subscriptions.yaml ← Digital Subscriptions (leaf)
31
+ ```
32
+
33
+ ## Key Concepts
34
+
35
+ - **Prima Seed**: The universal meta-protocol. Domain-agnostic questions that
36
+ generate domain-specific trees from any categorical data.
37
+
38
+ - **Domain Seed**: A specific questioning protocol for a domain (e.g., personal
39
+ finance transactions). Generated by running the Prima Seed against real data.
40
+ Reusable by anyone with similar data.
41
+
42
+ - **Node**: An ontological contract — a YAML file carrying inclusion criteria,
43
+ exclusion criteria, edge cases, and decision records. Each node is one file.
44
+
45
+ - **Canonical ID**: A deterministic identifier derived from the criteria path
46
+ (e.g., `ft.expenditure.variable.groceries`). Enables semantic joins.
47
+
48
+ - **Placement Map**: The mapping of raw data values to canonical IDs. This is
49
+ where meaning is assigned to data.
50
+
51
+ ## How to Read This
52
+
53
+ 1. Start with `prima/prima-seed.yaml` (shipped in this package as `prima-seed.yaml`, next to this README) — understand the questioning protocol
54
+ 2. Read `personal-finance-transactions/seed.yaml` — see how a domain seed
55
+ captures the context and level structure
56
+ 3. Browse `nodes/` — each file is a self-contained ontological contract
57
+ 4. Check `placement_map.yaml` — see how messy real-world bank transactions
58
+ get assigned to canonical IDs with meaning
59
+
60
+ ## What's Next
61
+
62
+ - [x] Build a CLI runner that executes the Prima Seed against any data column (available as the `opentaxonomy` command)
63
+ - [ ] Define canonical ID hashing algorithm
64
+ - [ ] Build the linking ID mechanism (cross-tree semantic comparison)
65
+ - [ ] Create second domain seed (German Grocery Products) for cross-domain testing
66
+ - [ ] Define the seed registry format for OpenTaxonomy platform
@@ -0,0 +1,31 @@
1
+ node: Digital Subscriptions
2
+ canonical_id: "ft.expenditure.fixed.subscriptions"
3
+ question: "Is this a recurring payment for a digital service or platform?"
4
+ criteria:
5
+ includes:
6
+ - "Streaming services (Netflix, Disney+, Spotify, MUBI, Audible)"
7
+ - "Software subscriptions (ChatGPT, Wix, Apple services)"
8
+ - "Gaming services (Nintendo, Google Play)"
9
+ - "Internet service (HerzoMedia/HERZOvision)"
10
+ excludes:
11
+ - "One-time digital purchases"
12
+ - "Physical subscription boxes"
13
+ - "Mobile phone contract (classified under telecommunications)"
14
+ edge_cases:
15
+ - term: "Amazon Prime"
16
+ resolution: "Included — recurring digital service even though it enables physical delivery"
17
+ decided: true
18
+ - term: "HerzoMedia/HERZOvision"
19
+ resolution: "Included — internet is a digital subscription; could also be telecommunications"
20
+ decided: false
21
+ parent: Fixed Obligations
22
+ children: []
23
+ version: 1.0.0
24
+ decision_record:
25
+ criterion_chosen: "Recurring billing for digital access"
26
+ alternatives_considered:
27
+ - "Split by media type (video, audio, software)"
28
+ reason: >
29
+ In budgeting, what matters is that these are fixed monthly
30
+ costs that can be individually cancelled. Media type is
31
+ secondary to the financial commitment structure.
@@ -0,0 +1,28 @@
1
+ node: Dining Out
2
+ canonical_id: "ft.expenditure.discretionary.dining"
3
+ question: "Is this a payment at a restaurant, cafe, bar, or food service establishment?"
4
+ criteria:
5
+ includes:
6
+ - "Full-service restaurants"
7
+ - "Fast food chains (McDonalds, Five Guys, etc.)"
8
+ - "Cafes and coffee shops (Starbucks, etc.)"
9
+ - "Bars and beer gardens"
10
+ - "Takeaway/delivery services (Lieferando, Takeaway.com)"
11
+ - "Corporate canteens (Eurest)"
12
+ - "Bakery cafes when dining in"
13
+ excludes:
14
+ - "Supermarket food purchases"
15
+ - "Grocery delivery services"
16
+ edge_cases:
17
+ - term: "Bakery (Baeckerei und Konditorei)"
18
+ resolution: "Included — the purchase is prepared food for immediate consumption"
19
+ decided: true
20
+ - term: "Food Affairs GmbH"
21
+ resolution: "Included — appears to be a food service/catering entity"
22
+ decided: true
23
+ - term: "Airport food (Relay, Exki, Gru Fridays)"
24
+ resolution: "Included — dining out regardless of location"
25
+ decided: true
26
+ parent: Discretionary
27
+ children: []
28
+ version: 1.0.0
@@ -0,0 +1,30 @@
1
+ node: Discretionary
2
+ canonical_id: "ft.expenditure.discretionary"
3
+ question: "Is this a lifestyle choice — spending that could be reduced or eliminated without impacting basic needs?"
4
+ criteria:
5
+ includes:
6
+ - "Restaurant meals and dining out"
7
+ - "Fashion and retail shopping"
8
+ - "Online shopping (non-essential)"
9
+ - "Entertainment, leisure, cultural activities"
10
+ - "Travel and accommodation"
11
+ excludes:
12
+ - "Contractual obligations"
13
+ - "Essential groceries and household supplies"
14
+ - "Health-related spending"
15
+ - "Commute-related transport"
16
+ edge_cases:
17
+ - term: "Fast food (McDonalds)"
18
+ resolution: "Included — discretionary even if routine; not a nutritional necessity"
19
+ decided: true
20
+ - term: "Eurest corporate canteen"
21
+ resolution: "Included — could bring lunch from home; dining choice"
22
+ decided: true
23
+ parent: Expenditure
24
+ children:
25
+ - dining_out
26
+ - shopping_fashion
27
+ - shopping_online_general
28
+ - entertainment_leisure
29
+ - travel_accommodation
30
+ version: 1.0.0
@@ -0,0 +1,27 @@
1
+ node: Expenditure
2
+ canonical_id: "ft.expenditure"
3
+ question: "Is money flowing out of the account?"
4
+ criteria:
5
+ includes:
6
+ - "Any debit, card payment, direct debit, standing order, or cash withdrawal"
7
+ excludes:
8
+ - "Incoming payments"
9
+ parent: Financial Transaction
10
+ children:
11
+ - fixed_obligations
12
+ - variable_necessities
13
+ - discretionary
14
+ - personal_transfers
15
+ - cash_withdrawals
16
+ version: 1.0.0
17
+ decision_record:
18
+ criterion_chosen: "Degree of obligation — how controllable is this spending?"
19
+ alternatives_considered:
20
+ - "Merchant type (where did I spend?)"
21
+ - "Payment method (card vs. transfer vs. cash)"
22
+ - "Amount range"
23
+ reason: >
24
+ In a personal budgeting context, the most actionable split
25
+ is controllability. Fixed obligations are locked in; variable
26
+ necessities can be optimized; discretionary can be cut. This
27
+ maps directly to how a person can act on their budget.
@@ -0,0 +1,22 @@
1
+ node: Groceries
2
+ canonical_id: "ft.expenditure.variable.groceries"
3
+ question: "Is this a purchase of food or household staples from a supermarket or grocery store?"
4
+ criteria:
5
+ includes:
6
+ - "Supermarket purchases (REWE, LIDL, EDEKA, etc.)"
7
+ - "Organic/bio market purchases"
8
+ - "International grocery equivalents (Carrefour, Pao de Acucar, CONAD, etc.)"
9
+ excludes:
10
+ - "Restaurant meals (including takeaway from non-grocery outlets)"
11
+ - "Specialty food shops that are primarily dining (bakery cafe)"
12
+ - "Drugstore purchases (Rossmann, DM), unless the items are food"
13
+ edge_cases:
14
+ - term: "REWE TO GO"
15
+ resolution: "Included — still a grocery/convenience purchase, not a restaurant"
16
+ decided: true
17
+ - term: "Biomarkt"
18
+ resolution: "Included — organic grocery store"
19
+ decided: true
20
+ parent: Variable Necessities
21
+ children: []
22
+ version: 1.0.0