databridge-core 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databridge_core-0.1.0/.github/workflows/ci.yml +53 -0
- databridge_core-0.1.0/.gitignore +12 -0
- databridge_core-0.1.0/LICENSE +21 -0
- databridge_core-0.1.0/PKG-INFO +124 -0
- databridge_core-0.1.0/README.md +74 -0
- databridge_core-0.1.0/examples/customers_a.csv +11 -0
- databridge_core-0.1.0/examples/customers_b.csv +10 -0
- databridge_core-0.1.0/examples/demo.py +58 -0
- databridge_core-0.1.0/pyproject.toml +76 -0
- databridge_core-0.1.0/src/databridge_core/__init__.py +72 -0
- databridge_core-0.1.0/src/databridge_core/_io.py +8 -0
- databridge_core-0.1.0/src/databridge_core/_types.py +125 -0
- databridge_core-0.1.0/src/databridge_core/cli.py +356 -0
- databridge_core-0.1.0/src/databridge_core/files.py +128 -0
- databridge_core-0.1.0/src/databridge_core/ingestion/__init__.py +15 -0
- databridge_core-0.1.0/src/databridge_core/ingestion/csv_loader.py +129 -0
- databridge_core-0.1.0/src/databridge_core/ingestion/ocr.py +45 -0
- databridge_core-0.1.0/src/databridge_core/ingestion/pdf.py +56 -0
- databridge_core-0.1.0/src/databridge_core/ingestion/table_parser.py +76 -0
- databridge_core-0.1.0/src/databridge_core/profiler/__init__.py +5 -0
- databridge_core-0.1.0/src/databridge_core/profiler/profile.py +120 -0
- databridge_core-0.1.0/src/databridge_core/reconciler/__init__.py +94 -0
- databridge_core-0.1.0/src/databridge_core/reconciler/differ.py +341 -0
- databridge_core-0.1.0/src/databridge_core/reconciler/fuzzy.py +149 -0
- databridge_core-0.1.0/src/databridge_core/reconciler/hasher.py +226 -0
- databridge_core-0.1.0/src/databridge_core/reconciler/merger.py +48 -0
- databridge_core-0.1.0/src/databridge_core/reconciler/transform.py +82 -0
- databridge_core-0.1.0/tests/conftest.py +59 -0
- databridge_core-0.1.0/tests/test_cli.py +70 -0
- databridge_core-0.1.0/tests/test_differ.py +161 -0
- databridge_core-0.1.0/tests/test_ingestion.py +92 -0
- databridge_core-0.1.0/tests/test_profiler.py +53 -0
- databridge_core-0.1.0/tests/test_reconciler.py +116 -0
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
20
|
+
uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: ${{ matrix.python-version }}
|
|
23
|
+
|
|
24
|
+
- name: Install dependencies
|
|
25
|
+
run: |
|
|
26
|
+
python -m pip install --upgrade pip
|
|
27
|
+
pip install -e ".[dev,fuzzy]"
|
|
28
|
+
|
|
29
|
+
- name: Lint with ruff
|
|
30
|
+
run: ruff check src/ tests/
|
|
31
|
+
|
|
32
|
+
- name: Run tests
|
|
33
|
+
run: pytest -v --tb=short
|
|
34
|
+
|
|
35
|
+
- name: Build package
|
|
36
|
+
run: python -m build
|
|
37
|
+
|
|
38
|
+
build-check:
|
|
39
|
+
runs-on: ubuntu-latest
|
|
40
|
+
steps:
|
|
41
|
+
- uses: actions/checkout@v4
|
|
42
|
+
|
|
43
|
+
- name: Set up Python
|
|
44
|
+
uses: actions/setup-python@v5
|
|
45
|
+
with:
|
|
46
|
+
python-version: "3.12"
|
|
47
|
+
|
|
48
|
+
- name: Install and verify
|
|
49
|
+
run: |
|
|
50
|
+
pip install -e ".[fuzzy]"
|
|
51
|
+
python -c "from databridge_core import compare_hashes, profile_data, load_csv; print('OK')"
|
|
52
|
+
databridge --version
|
|
53
|
+
databridge --help
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024-2026 DataBridge AI
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: databridge-core
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Data reconciliation, profiling, and ingestion toolkit: compare CSVs, fuzzy match, detect schema drift, and clean messy data.
|
|
5
|
+
Project-URL: Homepage, https://github.com/datanexum/databridge-core
|
|
6
|
+
Project-URL: Documentation, https://github.com/datanexum/databridge-core#readme
|
|
7
|
+
Project-URL: Repository, https://github.com/datanexum/databridge-core
|
|
8
|
+
Project-URL: Issues, https://github.com/datanexum/databridge-core/issues
|
|
9
|
+
Author-email: DataBridge AI <hello@databridgeai.com>
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: csv,data,diff,etl,finance,fuzzy-match,profiling,reconciliation
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Financial and Insurance Industry
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: Office/Business :: Financial
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Requires-Dist: click>=8.0
|
|
26
|
+
Requires-Dist: pandas>=1.5
|
|
27
|
+
Requires-Dist: pydantic>=2.0
|
|
28
|
+
Requires-Dist: rich>=13.0
|
|
29
|
+
Provides-Extra: all
|
|
30
|
+
Requires-Dist: pillow>=9.0; extra == 'all'
|
|
31
|
+
Requires-Dist: pypdf>=3.0; extra == 'all'
|
|
32
|
+
Requires-Dist: pytesseract>=0.3; extra == 'all'
|
|
33
|
+
Requires-Dist: rapidfuzz>=3.0; extra == 'all'
|
|
34
|
+
Requires-Dist: sqlalchemy>=2.0; extra == 'all'
|
|
35
|
+
Provides-Extra: dev
|
|
36
|
+
Requires-Dist: build>=1.0; extra == 'dev'
|
|
37
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
38
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
39
|
+
Requires-Dist: ruff>=0.1; extra == 'dev'
|
|
40
|
+
Provides-Extra: fuzzy
|
|
41
|
+
Requires-Dist: rapidfuzz>=3.0; extra == 'fuzzy'
|
|
42
|
+
Provides-Extra: ocr
|
|
43
|
+
Requires-Dist: pillow>=9.0; extra == 'ocr'
|
|
44
|
+
Requires-Dist: pytesseract>=0.3; extra == 'ocr'
|
|
45
|
+
Provides-Extra: pdf
|
|
46
|
+
Requires-Dist: pypdf>=3.0; extra == 'pdf'
|
|
47
|
+
Provides-Extra: sql
|
|
48
|
+
Requires-Dist: sqlalchemy>=2.0; extra == 'sql'
|
|
49
|
+
Description-Content-Type: text/markdown
|
|
50
|
+
|
|
51
|
+
# DataBridge Core
|
|
52
|
+
|
|
53
|
+
**Your finance team just spent 4 hours on VLOOKUP. This takes 5 seconds.**
|
|
54
|
+
|
|
55
|
+
DataBridge Core is a Python toolkit for data reconciliation, profiling, and ingestion. Compare CSV files, find fuzzy matches, detect schema drift, and clean messy data -- from the command line or Python.
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
pip install databridge-core
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## 5-Second Demo
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
# Profile a file
|
|
65
|
+
databridge profile sales.csv
|
|
66
|
+
|
|
67
|
+
# Compare two sources -- find orphans, conflicts, match rate
|
|
68
|
+
databridge compare source.csv target.csv --keys id
|
|
69
|
+
|
|
70
|
+
# Fuzzy match names across systems
|
|
71
|
+
databridge fuzzy erp_accounts.csv gl_accounts.csv --column name --threshold 80
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Python API
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from databridge_core import compare_hashes, profile_data, load_csv
|
|
78
|
+
|
|
79
|
+
# Profile your data
|
|
80
|
+
profile = profile_data("chart_of_accounts.csv")
|
|
81
|
+
print(f"{profile['rows']} rows, {profile['columns']} columns")
|
|
82
|
+
print(f"Potential keys: {profile['potential_key_columns']}")
|
|
83
|
+
|
|
84
|
+
# Compare two sources
|
|
85
|
+
result = compare_hashes("source.csv", "target.csv", key_columns="account_id")
|
|
86
|
+
stats = result["statistics"]
|
|
87
|
+
print(f"Match rate: {stats['match_rate_percent']}%")
|
|
88
|
+
print(f"Conflicts: {stats['conflicts']}, Orphans: {stats['total_orphans']}")
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Commands
|
|
92
|
+
|
|
93
|
+
| Command | Description |
|
|
94
|
+
|---------|-------------|
|
|
95
|
+
| `databridge profile <file>` | Profile data: structure, quality, cardinality |
|
|
96
|
+
| `databridge compare <a> <b> --keys <col>` | Hash comparison: orphans, conflicts, match rate |
|
|
97
|
+
| `databridge fuzzy <a> <b> -c <col>` | Fuzzy match columns across two files |
|
|
98
|
+
| `databridge diff <a> <b>` | Text diff between two files |
|
|
99
|
+
| `databridge drift <old> <new>` | Detect schema drift between CSVs |
|
|
100
|
+
| `databridge transform <file> -c <col> --op upper` | Clean a column (upper/lower/strip/trim/remove_special) |
|
|
101
|
+
| `databridge merge <a> <b> --keys <col>` | Merge two CSVs on key columns |
|
|
102
|
+
| `databridge find "*.csv"` | Find files matching a pattern |
|
|
103
|
+
| `databridge parse <text>` | Parse tabular data from messy text |
|
|
104
|
+
|
|
105
|
+
## Optional Extras
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
pip install 'databridge-core[fuzzy]' # Fuzzy matching (rapidfuzz)
|
|
109
|
+
pip install 'databridge-core[pdf]' # PDF text extraction (pypdf)
|
|
110
|
+
pip install 'databridge-core[ocr]' # OCR image extraction (pytesseract)
|
|
111
|
+
pip install 'databridge-core[sql]' # Database queries (sqlalchemy)
|
|
112
|
+
pip install 'databridge-core[all]' # Everything
|
|
113
|
+
pip install 'databridge-core[dev]' # Development tools (pytest, ruff, build)
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## Built for Finance
|
|
117
|
+
|
|
118
|
+
DataBridge Core is the open-source foundation of [DataBridge AI](https://github.com/datanexum/databridge-ai) -- a full platform for financial hierarchy management, dbt model generation, and enterprise data reconciliation.
|
|
119
|
+
|
|
120
|
+
**How it works:** Upload your Chart of Accounts. Get a production-ready financial hierarchy and dbt models. Zero config.
|
|
121
|
+
|
|
122
|
+
## License
|
|
123
|
+
|
|
124
|
+
MIT
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# DataBridge Core
|
|
2
|
+
|
|
3
|
+
**Your finance team just spent 4 hours on VLOOKUP. This takes 5 seconds.**
|
|
4
|
+
|
|
5
|
+
DataBridge Core is a Python toolkit for data reconciliation, profiling, and ingestion. Compare CSV files, find fuzzy matches, detect schema drift, and clean messy data -- from the command line or Python.
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install databridge-core
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## 5-Second Demo
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
# Profile a file
|
|
15
|
+
databridge profile sales.csv
|
|
16
|
+
|
|
17
|
+
# Compare two sources -- find orphans, conflicts, match rate
|
|
18
|
+
databridge compare source.csv target.csv --keys id
|
|
19
|
+
|
|
20
|
+
# Fuzzy match names across systems
|
|
21
|
+
databridge fuzzy erp_accounts.csv gl_accounts.csv --column name --threshold 80
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Python API
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
from databridge_core import compare_hashes, profile_data, load_csv
|
|
28
|
+
|
|
29
|
+
# Profile your data
|
|
30
|
+
profile = profile_data("chart_of_accounts.csv")
|
|
31
|
+
print(f"{profile['rows']} rows, {profile['columns']} columns")
|
|
32
|
+
print(f"Potential keys: {profile['potential_key_columns']}")
|
|
33
|
+
|
|
34
|
+
# Compare two sources
|
|
35
|
+
result = compare_hashes("source.csv", "target.csv", key_columns="account_id")
|
|
36
|
+
stats = result["statistics"]
|
|
37
|
+
print(f"Match rate: {stats['match_rate_percent']}%")
|
|
38
|
+
print(f"Conflicts: {stats['conflicts']}, Orphans: {stats['total_orphans']}")
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Commands
|
|
42
|
+
|
|
43
|
+
| Command | Description |
|
|
44
|
+
|---------|-------------|
|
|
45
|
+
| `databridge profile <file>` | Profile data: structure, quality, cardinality |
|
|
46
|
+
| `databridge compare <a> <b> --keys <col>` | Hash comparison: orphans, conflicts, match rate |
|
|
47
|
+
| `databridge fuzzy <a> <b> -c <col>` | Fuzzy match columns across two files |
|
|
48
|
+
| `databridge diff <a> <b>` | Text diff between two files |
|
|
49
|
+
| `databridge drift <old> <new>` | Detect schema drift between CSVs |
|
|
50
|
+
| `databridge transform <file> -c <col> --op upper` | Clean a column (upper/lower/strip/trim/remove_special) |
|
|
51
|
+
| `databridge merge <a> <b> --keys <col>` | Merge two CSVs on key columns |
|
|
52
|
+
| `databridge find "*.csv"` | Find files matching a pattern |
|
|
53
|
+
| `databridge parse <text>` | Parse tabular data from messy text |
|
|
54
|
+
|
|
55
|
+
## Optional Extras
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
pip install 'databridge-core[fuzzy]' # Fuzzy matching (rapidfuzz)
|
|
59
|
+
pip install 'databridge-core[pdf]' # PDF text extraction (pypdf)
|
|
60
|
+
pip install 'databridge-core[ocr]' # OCR image extraction (pytesseract)
|
|
61
|
+
pip install 'databridge-core[sql]' # Database queries (sqlalchemy)
|
|
62
|
+
pip install 'databridge-core[all]' # Everything
|
|
63
|
+
pip install 'databridge-core[dev]' # Development tools (pytest, ruff, build)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Built for Finance
|
|
67
|
+
|
|
68
|
+
DataBridge Core is the open-source foundation of [DataBridge AI](https://github.com/datanexum/databridge-ai) -- a full platform for financial hierarchy management, dbt model generation, and enterprise data reconciliation.
|
|
69
|
+
|
|
70
|
+
**How it works:** Upload your Chart of Accounts. Get a production-ready financial hierarchy and dbt models. Zero config.
|
|
71
|
+
|
|
72
|
+
## License
|
|
73
|
+
|
|
74
|
+
MIT
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
id,name,email,city,balance
|
|
2
|
+
1,Alice Johnson,alice@example.com,New York,1500.00
|
|
3
|
+
2,Bob Smith,bob@example.com,Chicago,2300.50
|
|
4
|
+
3,Charlie Brown,charlie@example.com,Houston,850.75
|
|
5
|
+
4,Diana Prince,diana@example.com,Phoenix,3200.00
|
|
6
|
+
5,Eve Williams,eve@example.com,San Antonio,1100.25
|
|
7
|
+
6,Frank Castle,frank@example.com,Dallas,4500.00
|
|
8
|
+
7,Grace Hopper,grace@example.com,San Jose,2750.30
|
|
9
|
+
8,Hank Pym,hank@example.com,Austin,990.00
|
|
10
|
+
9,Ivy League,ivy@example.com,Columbus,1800.60
|
|
11
|
+
10,Jack Ryan,jack@example.com,Charlotte,3100.45
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
id,name,email,city,balance
|
|
2
|
+
1,Alice Johnson,alice@example.com,New York,1500.00
|
|
3
|
+
2,Bob Smith,bob@example.com,Chicago,2400.50
|
|
4
|
+
3,Charles Brown,charlie@example.com,Houston,850.75
|
|
5
|
+
4,Diana Prince,diana@example.com,Scottsdale,3200.00
|
|
6
|
+
5,Eve Williams,eve@example.com,San Antonio,1100.25
|
|
7
|
+
6,Frank Castle,frank@example.com,Dallas,4500.00
|
|
8
|
+
7,Grace Hopper,grace@example.com,San Jose,2750.30
|
|
9
|
+
8,Hank Pym,hank@example.com,Austin,990.00
|
|
10
|
+
11,Kate Bishop,kate@example.com,Denver,2100.00
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""DataBridge Core -- Quick demo.
|
|
2
|
+
|
|
3
|
+
Run: python examples/demo.py
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
# Resolve example file paths
|
|
9
|
+
examples_dir = Path(__file__).parent
|
|
10
|
+
file_a = str(examples_dir / "customers_a.csv")
|
|
11
|
+
file_b = str(examples_dir / "customers_b.csv")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def main():
|
|
15
|
+
from databridge_core import compare_hashes, profile_data, load_csv
|
|
16
|
+
|
|
17
|
+
# 1. Profile the source file
|
|
18
|
+
print("=" * 60)
|
|
19
|
+
print("1. PROFILE SOURCE DATA")
|
|
20
|
+
print("=" * 60)
|
|
21
|
+
profile = profile_data(file_a)
|
|
22
|
+
print(f" File: {profile['file']}")
|
|
23
|
+
print(f" Rows: {profile['rows']}, Columns: {profile['columns']}")
|
|
24
|
+
print(f" Type: {profile['structure_type']}")
|
|
25
|
+
print(f" Potential keys: {profile['potential_key_columns']}")
|
|
26
|
+
print()
|
|
27
|
+
|
|
28
|
+
# 2. Compare two sources
|
|
29
|
+
print("=" * 60)
|
|
30
|
+
print("2. COMPARE SOURCES")
|
|
31
|
+
print("=" * 60)
|
|
32
|
+
result = compare_hashes(file_a, file_b, key_columns="id")
|
|
33
|
+
stats = result["statistics"]
|
|
34
|
+
print(f" Source A: {result['source_a']['total_rows']} rows")
|
|
35
|
+
print(f" Source B: {result['source_b']['total_rows']} rows")
|
|
36
|
+
print(f" Exact matches: {stats['exact_matches']}")
|
|
37
|
+
print(f" Conflicts: {stats['conflicts']}")
|
|
38
|
+
print(f" Orphans in A: {stats['orphans_only_in_source_a']}")
|
|
39
|
+
print(f" Orphans in B: {stats['orphans_only_in_source_b']}")
|
|
40
|
+
print(f" Match rate: {stats['match_rate_percent']}%")
|
|
41
|
+
print()
|
|
42
|
+
|
|
43
|
+
# 3. Load and preview
|
|
44
|
+
print("=" * 60)
|
|
45
|
+
print("3. LOAD & PREVIEW")
|
|
46
|
+
print("=" * 60)
|
|
47
|
+
loaded = load_csv(file_a, preview_rows=3)
|
|
48
|
+
print(f" Columns: {loaded['columns']}")
|
|
49
|
+
print(f" Preview (first 3 rows):")
|
|
50
|
+
for row in loaded["preview"]:
|
|
51
|
+
print(f" {row}")
|
|
52
|
+
print()
|
|
53
|
+
|
|
54
|
+
print("Done! Try the CLI: databridge profile examples/customers_a.csv")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
if __name__ == "__main__":
|
|
58
|
+
main()
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "databridge-core"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Upload your Chart of Accounts. Get a production-ready financial hierarchy and dbt models. Zero config."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "DataBridge AI", email = "hello@databridgeai.com" },
|
|
14
|
+
]
|
|
15
|
+
keywords = ["data", "reconciliation", "profiling", "etl", "finance", "csv", "diff", "fuzzy-match"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Intended Audience :: Financial and Insurance Industry",
|
|
20
|
+
"License :: OSI Approved :: MIT License",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Programming Language :: Python :: 3.10",
|
|
23
|
+
"Programming Language :: Python :: 3.11",
|
|
24
|
+
"Programming Language :: Python :: 3.12",
|
|
25
|
+
"Programming Language :: Python :: 3.13",
|
|
26
|
+
"Topic :: Office/Business :: Financial",
|
|
27
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
dependencies = [
|
|
31
|
+
"pandas>=1.5",
|
|
32
|
+
"pydantic>=2.0",
|
|
33
|
+
"click>=8.0",
|
|
34
|
+
"rich>=13.0",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.optional-dependencies]
|
|
38
|
+
fuzzy = ["rapidfuzz>=3.0"]
|
|
39
|
+
pdf = ["pypdf>=3.0"]
|
|
40
|
+
ocr = ["pytesseract>=0.3", "Pillow>=9.0"]
|
|
41
|
+
sql = ["sqlalchemy>=2.0"]
|
|
42
|
+
all = [
|
|
43
|
+
"rapidfuzz>=3.0",
|
|
44
|
+
"pypdf>=3.0",
|
|
45
|
+
"pytesseract>=0.3",
|
|
46
|
+
"Pillow>=9.0",
|
|
47
|
+
"sqlalchemy>=2.0",
|
|
48
|
+
]
|
|
49
|
+
dev = [
|
|
50
|
+
"pytest>=7.0",
|
|
51
|
+
"pytest-cov>=4.0",
|
|
52
|
+
"ruff>=0.1",
|
|
53
|
+
"build>=1.0",
|
|
54
|
+
]
|
|
55
|
+
|
|
56
|
+
[project.urls]
|
|
57
|
+
Homepage = "https://github.com/datanexum/databridge-core"
|
|
58
|
+
Documentation = "https://github.com/datanexum/databridge-core#readme"
|
|
59
|
+
Repository = "https://github.com/datanexum/databridge-core"
|
|
60
|
+
Issues = "https://github.com/datanexum/databridge-core/issues"
|
|
61
|
+
|
|
62
|
+
[project.scripts]
|
|
63
|
+
databridge = "databridge_core.cli:cli"
|
|
64
|
+
|
|
65
|
+
[tool.hatch.build.targets.wheel]
|
|
66
|
+
packages = ["src/databridge_core"]
|
|
67
|
+
|
|
68
|
+
[tool.ruff]
|
|
69
|
+
target-version = "py310"
|
|
70
|
+
line-length = 100
|
|
71
|
+
|
|
72
|
+
[tool.ruff.lint]
|
|
73
|
+
select = ["E", "F", "I", "W"]
|
|
74
|
+
|
|
75
|
+
[tool.pytest.ini_options]
|
|
76
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""DataBridge Core -- Data reconciliation, profiling, and ingestion toolkit.
|
|
2
|
+
|
|
3
|
+
Upload your Chart of Accounts. Get a production-ready financial hierarchy
|
|
4
|
+
and dbt models. Zero config.
|
|
5
|
+
|
|
6
|
+
Quick start::
|
|
7
|
+
|
|
8
|
+
from databridge_core import compare_hashes, profile_data, load_csv
|
|
9
|
+
|
|
10
|
+
result = profile_data("sales.csv")
|
|
11
|
+
print(result["rows"], "rows,", result["columns"], "columns")
|
|
12
|
+
|
|
13
|
+
comparison = compare_hashes("source.csv", "target.csv", key_columns="id")
|
|
14
|
+
print(comparison["statistics"]["match_rate_percent"], "% match rate")
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
__version__ = "0.1.0"
|
|
18
|
+
|
|
19
|
+
# Reconciler
|
|
20
|
+
from .reconciler import (
|
|
21
|
+
compare_hashes,
|
|
22
|
+
get_orphan_details,
|
|
23
|
+
get_conflict_details,
|
|
24
|
+
fuzzy_match_columns,
|
|
25
|
+
fuzzy_deduplicate,
|
|
26
|
+
merge_sources,
|
|
27
|
+
compute_similarity,
|
|
28
|
+
diff_lists,
|
|
29
|
+
diff_dicts,
|
|
30
|
+
explain_diff,
|
|
31
|
+
find_close_matches,
|
|
32
|
+
find_similar_strings,
|
|
33
|
+
transform_column,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
# Profiler
|
|
37
|
+
from .profiler import profile_data, detect_schema_drift
|
|
38
|
+
|
|
39
|
+
# Ingestion
|
|
40
|
+
from .ingestion import load_csv, load_json, extract_pdf_text, parse_table_from_text
|
|
41
|
+
|
|
42
|
+
# Files
|
|
43
|
+
from .files import find_files, stage_file
|
|
44
|
+
|
|
45
|
+
__all__ = [
|
|
46
|
+
"__version__",
|
|
47
|
+
# Reconciler
|
|
48
|
+
"compare_hashes",
|
|
49
|
+
"get_orphan_details",
|
|
50
|
+
"get_conflict_details",
|
|
51
|
+
"fuzzy_match_columns",
|
|
52
|
+
"fuzzy_deduplicate",
|
|
53
|
+
"merge_sources",
|
|
54
|
+
"compute_similarity",
|
|
55
|
+
"diff_lists",
|
|
56
|
+
"diff_dicts",
|
|
57
|
+
"explain_diff",
|
|
58
|
+
"find_close_matches",
|
|
59
|
+
"find_similar_strings",
|
|
60
|
+
"transform_column",
|
|
61
|
+
# Profiler
|
|
62
|
+
"profile_data",
|
|
63
|
+
"detect_schema_drift",
|
|
64
|
+
# Ingestion
|
|
65
|
+
"load_csv",
|
|
66
|
+
"load_json",
|
|
67
|
+
"extract_pdf_text",
|
|
68
|
+
"parse_table_from_text",
|
|
69
|
+
# Files
|
|
70
|
+
"find_files",
|
|
71
|
+
"stage_file",
|
|
72
|
+
]
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""Shared result types for the databridge-core library.
|
|
2
|
+
|
|
3
|
+
All library functions return Python objects (dicts, dataclasses, Pydantic models).
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# -- Profiler types --
|
|
12
|
+
|
|
13
|
+
class ProfileResult(BaseModel):
    """Result of profiling a data source.

    Field names mirror the keys of the dict produced by the profiler
    (``profile_data``) -- NOTE(review): exact value semantics are defined in
    profiler/profile.py, not visible here.
    """

    file: str  # path of the profiled file
    rows: int  # number of data rows
    columns: int  # number of columns
    structure_type: str  # profiler-assigned structure label; TODO confirm possible values
    column_types: Dict[str, str]  # column name -> inferred type name
    potential_key_columns: List[str]  # presumably columns whose values look unique; verify against profiler
    high_cardinality_cols: List[str]  # columns with many distinct values
    low_cardinality_cols: List[str]  # columns with few distinct values
    data_quality: Dict[str, Any]  # quality metrics; schema defined by the profiler
    statistics: Dict[str, Any]  # summary statistics; schema defined by the profiler
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class DriftResult(BaseModel):
    """Result of schema drift detection between two sources."""

    source_a: str  # identifier/path of the first source
    source_b: str  # identifier/path of the second source
    columns_added: List[str]  # presumably present in B but not A -- TODO confirm direction
    columns_removed: List[str]  # presumably present in A but not B -- TODO confirm direction
    columns_common: List[str]  # columns shared by both sources
    type_changes: Dict[str, Dict[str, Any]]  # column -> details of its type change
    has_drift: bool  # convenience flag; presumably True when any difference was found
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# -- Reconciler types --
|
|
39
|
+
|
|
40
|
+
class CompareHashesResult(BaseModel):
    """Result of hash-based row comparison."""

    source_a: Dict[str, Any]  # per-source metadata (includes total_rows)
    source_b: Dict[str, Any]  # per-source metadata (includes total_rows)
    key_columns: List[str]  # columns used to pair rows across sources
    compare_columns: List[str]  # columns whose values were hashed/compared
    statistics: Dict[str, Any]  # exact_matches, conflicts, orphan counts, match_rate_percent, ...
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class OrphanResult(BaseModel):
    """Result of orphan record retrieval.

    Only the side(s) requested are populated; the other field stays None.
    """

    orphan_source: str  # which source's orphans were requested; TODO confirm accepted values
    orphans_in_a: Optional[Dict[str, Any]] = None  # records present only in source A, if requested
    orphans_in_b: Optional[Dict[str, Any]] = None  # records present only in source B, if requested
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class ConflictResult(BaseModel):
    """Result of conflict detail retrieval."""

    total_conflicts: int  # total number of conflicting key matches found
    showing: int  # how many conflicts are included below (presumably capped/truncated)
    conflicts: List[Dict[str, Any]]  # per-conflict details; schema defined by the reconciler
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class FuzzyMatchResult(BaseModel):
    """Result of fuzzy column matching across two sources."""

    column_a: str  # matched column in the first source
    column_b: str  # matched column in the second source
    threshold: int  # minimum similarity score accepted (0-100 scale assumed -- TODO confirm)
    total_matches: int  # number of pairs at or above the threshold
    top_matches: List[Dict[str, Any]]  # best-scoring pairs; schema defined by the matcher
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class MergeResult(BaseModel):
    """Result of merging two sources."""

    source_a_rows: int  # row count of the first input
    source_b_rows: int  # row count of the second input
    merged_rows: int  # row count of the merged output
    merge_type: str  # join strategy used (e.g. inner/outer -- TODO confirm values)
    columns: List[str]  # columns of the merged output
    preview: List[Dict[str, Any]]  # sample of merged rows as records
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# -- Ingestion types --
|
|
83
|
+
|
|
84
|
+
class LoadResult(BaseModel):
    """Result of loading a file."""

    file: str  # path of the loaded file
    rows: int  # number of data rows loaded
    columns: List[str]  # column names
    preview: List[Dict[str, Any]]  # first N rows as records
    dtypes: Optional[Dict[str, str]] = None  # column -> dtype name, when available
    null_counts: Optional[Dict[str, int]] = None  # column -> null count, when computed
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class PdfExtractResult(BaseModel):
    """Result of PDF text extraction."""

    file: str  # path of the source PDF
    total_pages: int  # page count of the document
    pages_extracted: int  # pages actually extracted (may be fewer than total_pages)
    content: List[Dict[str, Any]]  # per-page extracted text; schema defined by the extractor
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class OcrResult(BaseModel):
    """Result of OCR text extraction from an image."""

    file: str  # path of the source image
    language: str  # OCR language code used (tesseract-style assumed -- TODO confirm)
    text: str  # full recognized text
    character_count: int  # length of `text`
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class TableParseResult(BaseModel):
    """Result of parsing tabular data from free-form text.

    All fields are optional: which ones are populated depends on whether the
    parser recognized a full table or only a single raw row -- NOTE(review):
    confirm against ingestion/table_parser.py.
    """

    columns: Optional[List[str]] = None  # detected header columns, if any
    row_count: Optional[int] = None  # number of parsed data rows, if a table was found
    preview: Optional[List[Dict[str, Any]]] = None  # sample of parsed rows as records
    raw_row: Optional[List[str]] = None  # fallback: a single tokenized row when no table detected
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class QueryResult(BaseModel):
    """Result of a database query."""

    rows_returned: int  # number of rows the query produced
    columns: List[str]  # result-set column names
    dtypes: Optional[Dict[str, str]] = None  # column -> dtype name, when available
    preview: List[Dict[str, Any]]  # sample of result rows as records
    truncated: bool = False  # True when the preview omits rows
    sql: Optional[str] = None  # the executed SQL statement, when echoed back