databridge-core 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. databridge_core-0.1.0/.github/workflows/ci.yml +53 -0
  2. databridge_core-0.1.0/.gitignore +12 -0
  3. databridge_core-0.1.0/LICENSE +21 -0
  4. databridge_core-0.1.0/PKG-INFO +124 -0
  5. databridge_core-0.1.0/README.md +74 -0
  6. databridge_core-0.1.0/examples/customers_a.csv +11 -0
  7. databridge_core-0.1.0/examples/customers_b.csv +10 -0
  8. databridge_core-0.1.0/examples/demo.py +58 -0
  9. databridge_core-0.1.0/pyproject.toml +76 -0
  10. databridge_core-0.1.0/src/databridge_core/__init__.py +72 -0
  11. databridge_core-0.1.0/src/databridge_core/_io.py +8 -0
  12. databridge_core-0.1.0/src/databridge_core/_types.py +125 -0
  13. databridge_core-0.1.0/src/databridge_core/cli.py +356 -0
  14. databridge_core-0.1.0/src/databridge_core/files.py +128 -0
  15. databridge_core-0.1.0/src/databridge_core/ingestion/__init__.py +15 -0
  16. databridge_core-0.1.0/src/databridge_core/ingestion/csv_loader.py +129 -0
  17. databridge_core-0.1.0/src/databridge_core/ingestion/ocr.py +45 -0
  18. databridge_core-0.1.0/src/databridge_core/ingestion/pdf.py +56 -0
  19. databridge_core-0.1.0/src/databridge_core/ingestion/table_parser.py +76 -0
  20. databridge_core-0.1.0/src/databridge_core/profiler/__init__.py +5 -0
  21. databridge_core-0.1.0/src/databridge_core/profiler/profile.py +120 -0
  22. databridge_core-0.1.0/src/databridge_core/reconciler/__init__.py +94 -0
  23. databridge_core-0.1.0/src/databridge_core/reconciler/differ.py +341 -0
  24. databridge_core-0.1.0/src/databridge_core/reconciler/fuzzy.py +149 -0
  25. databridge_core-0.1.0/src/databridge_core/reconciler/hasher.py +226 -0
  26. databridge_core-0.1.0/src/databridge_core/reconciler/merger.py +48 -0
  27. databridge_core-0.1.0/src/databridge_core/reconciler/transform.py +82 -0
  28. databridge_core-0.1.0/tests/conftest.py +59 -0
  29. databridge_core-0.1.0/tests/test_cli.py +70 -0
  30. databridge_core-0.1.0/tests/test_differ.py +161 -0
  31. databridge_core-0.1.0/tests/test_ingestion.py +92 -0
  32. databridge_core-0.1.0/tests/test_profiler.py +53 -0
  33. databridge_core-0.1.0/tests/test_reconciler.py +116 -0
@@ -0,0 +1,53 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Set up Python ${{ matrix.python-version }}
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+
24
+ - name: Install dependencies
25
+ run: |
26
+ python -m pip install --upgrade pip
27
+ pip install -e ".[dev,fuzzy]"
28
+
29
+ - name: Lint with ruff
30
+ run: ruff check src/ tests/
31
+
32
+ - name: Run tests
33
+ run: pytest -v --tb=short
34
+
35
+ - name: Build package
36
+ run: python -m build
37
+
38
+ build-check:
39
+ runs-on: ubuntu-latest
40
+ steps:
41
+ - uses: actions/checkout@v4
42
+
43
+ - name: Set up Python
44
+ uses: actions/setup-python@v5
45
+ with:
46
+ python-version: "3.12"
47
+
48
+ - name: Install and verify
49
+ run: |
50
+ pip install -e ".[fuzzy]"
51
+ python -c "from databridge_core import compare_hashes, profile_data, load_csv; print('OK')"
52
+ databridge --version
53
+ databridge --help
@@ -0,0 +1,12 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ dist/
5
+ build/
6
+ *.egg-info/
7
+ .eggs/
8
+ .pytest_cache/
9
+ .ruff_cache/
10
+ *.egg
11
+ .venv/
12
+ venv/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024-2026 DataBridge AI
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,124 @@
1
+ Metadata-Version: 2.4
2
+ Name: databridge-core
3
+ Version: 0.1.0
4
+ Summary: Upload your Chart of Accounts. Get a production-ready financial hierarchy and dbt models. Zero config.
5
+ Project-URL: Homepage, https://github.com/datanexum/databridge-core
6
+ Project-URL: Documentation, https://github.com/datanexum/databridge-core#readme
7
+ Project-URL: Repository, https://github.com/datanexum/databridge-core
8
+ Project-URL: Issues, https://github.com/datanexum/databridge-core/issues
9
+ Author-email: DataBridge AI <hello@databridgeai.com>
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: csv,data,diff,etl,finance,fuzzy-match,profiling,reconciliation
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Financial and Insurance Industry
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: Office/Business :: Financial
23
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
24
+ Requires-Python: >=3.10
25
+ Requires-Dist: click>=8.0
26
+ Requires-Dist: pandas>=1.5
27
+ Requires-Dist: pydantic>=2.0
28
+ Requires-Dist: rich>=13.0
29
+ Provides-Extra: all
30
+ Requires-Dist: pillow>=9.0; extra == 'all'
31
+ Requires-Dist: pypdf>=3.0; extra == 'all'
32
+ Requires-Dist: pytesseract>=0.3; extra == 'all'
33
+ Requires-Dist: rapidfuzz>=3.0; extra == 'all'
34
+ Requires-Dist: sqlalchemy>=2.0; extra == 'all'
35
+ Provides-Extra: dev
36
+ Requires-Dist: build>=1.0; extra == 'dev'
37
+ Requires-Dist: pytest-cov>=4.0; extra == 'dev'
38
+ Requires-Dist: pytest>=7.0; extra == 'dev'
39
+ Requires-Dist: ruff>=0.1; extra == 'dev'
40
+ Provides-Extra: fuzzy
41
+ Requires-Dist: rapidfuzz>=3.0; extra == 'fuzzy'
42
+ Provides-Extra: ocr
43
+ Requires-Dist: pillow>=9.0; extra == 'ocr'
44
+ Requires-Dist: pytesseract>=0.3; extra == 'ocr'
45
+ Provides-Extra: pdf
46
+ Requires-Dist: pypdf>=3.0; extra == 'pdf'
47
+ Provides-Extra: sql
48
+ Requires-Dist: sqlalchemy>=2.0; extra == 'sql'
49
+ Description-Content-Type: text/markdown
50
+
51
+ # DataBridge Core
52
+
53
+ **Your finance team just spent 4 hours on VLOOKUP. This takes 5 seconds.**
54
+
55
+ DataBridge Core is a Python toolkit for data reconciliation, profiling, and ingestion. Compare CSV files, find fuzzy matches, detect schema drift, and clean messy data -- from the command line or Python.
56
+
57
+ ```bash
58
+ pip install databridge-core
59
+ ```
60
+
61
+ ## 5-Second Demo
62
+
63
+ ```bash
64
+ # Profile a file
65
+ databridge profile sales.csv
66
+
67
+ # Compare two sources -- find orphans, conflicts, match rate
68
+ databridge compare source.csv target.csv --keys id
69
+
70
+ # Fuzzy match names across systems
71
+ databridge fuzzy erp_accounts.csv gl_accounts.csv --column name --threshold 80
72
+ ```
73
+
74
+ ## Python API
75
+
76
+ ```python
77
+ from databridge_core import compare_hashes, profile_data, load_csv
78
+
79
+ # Profile your data
80
+ profile = profile_data("chart_of_accounts.csv")
81
+ print(f"{profile['rows']} rows, {profile['columns']} columns")
82
+ print(f"Potential keys: {profile['potential_key_columns']}")
83
+
84
+ # Compare two sources
85
+ result = compare_hashes("source.csv", "target.csv", key_columns="account_id")
86
+ stats = result["statistics"]
87
+ print(f"Match rate: {stats['match_rate_percent']}%")
88
+ print(f"Conflicts: {stats['conflicts']}, Orphans: {stats['total_orphans']}")
89
+ ```
90
+
91
+ ## Commands
92
+
93
+ | Command | Description |
94
+ |---------|-------------|
95
+ | `databridge profile <file>` | Profile data: structure, quality, cardinality |
96
+ | `databridge compare <a> <b> --keys <col>` | Hash comparison: orphans, conflicts, match rate |
97
+ | `databridge fuzzy <a> <b> -c <col>` | Fuzzy match columns across two files |
98
+ | `databridge diff <a> <b>` | Text diff between two files |
99
+ | `databridge drift <old> <new>` | Detect schema drift between CSVs |
100
+ | `databridge transform <file> -c <col> --op upper` | Clean a column (upper/lower/strip/trim/remove_special) |
101
+ | `databridge merge <a> <b> --keys <col>` | Merge two CSVs on key columns |
102
+ | `databridge find "*.csv"` | Find files matching a pattern |
103
+ | `databridge parse <text>` | Parse tabular data from messy text |
104
+
105
+ ## Optional Extras
106
+
107
+ ```bash
108
+ pip install 'databridge-core[fuzzy]' # Fuzzy matching (rapidfuzz)
109
+ pip install 'databridge-core[pdf]' # PDF text extraction (pypdf)
110
+ pip install 'databridge-core[ocr]' # OCR image extraction (pytesseract)
111
+ pip install 'databridge-core[sql]' # Database queries (sqlalchemy)
112
+ pip install 'databridge-core[all]' # Everything
113
+ pip install 'databridge-core[dev]' # Development tools (pytest, ruff, build)
114
+ ```
115
+
116
+ ## Built for Finance
117
+
118
+ DataBridge Core is the open-source foundation of [DataBridge AI](https://github.com/datanexum/databridge-ai) -- a full platform for financial hierarchy management, dbt model generation, and enterprise data reconciliation.
119
+
120
+ **How it works:** Upload your Chart of Accounts. Get a production-ready financial hierarchy and dbt models. Zero config.
121
+
122
+ ## License
123
+
124
+ MIT
@@ -0,0 +1,74 @@
1
+ # DataBridge Core
2
+
3
+ **Your finance team just spent 4 hours on VLOOKUP. This takes 5 seconds.**
4
+
5
+ DataBridge Core is a Python toolkit for data reconciliation, profiling, and ingestion. Compare CSV files, find fuzzy matches, detect schema drift, and clean messy data -- from the command line or Python.
6
+
7
+ ```bash
8
+ pip install databridge-core
9
+ ```
10
+
11
+ ## 5-Second Demo
12
+
13
+ ```bash
14
+ # Profile a file
15
+ databridge profile sales.csv
16
+
17
+ # Compare two sources -- find orphans, conflicts, match rate
18
+ databridge compare source.csv target.csv --keys id
19
+
20
+ # Fuzzy match names across systems
21
+ databridge fuzzy erp_accounts.csv gl_accounts.csv --column name --threshold 80
22
+ ```
23
+
24
+ ## Python API
25
+
26
+ ```python
27
+ from databridge_core import compare_hashes, profile_data, load_csv
28
+
29
+ # Profile your data
30
+ profile = profile_data("chart_of_accounts.csv")
31
+ print(f"{profile['rows']} rows, {profile['columns']} columns")
32
+ print(f"Potential keys: {profile['potential_key_columns']}")
33
+
34
+ # Compare two sources
35
+ result = compare_hashes("source.csv", "target.csv", key_columns="account_id")
36
+ stats = result["statistics"]
37
+ print(f"Match rate: {stats['match_rate_percent']}%")
38
+ print(f"Conflicts: {stats['conflicts']}, Orphans: {stats['total_orphans']}")
39
+ ```
40
+
41
+ ## Commands
42
+
43
+ | Command | Description |
44
+ |---------|-------------|
45
+ | `databridge profile <file>` | Profile data: structure, quality, cardinality |
46
+ | `databridge compare <a> <b> --keys <col>` | Hash comparison: orphans, conflicts, match rate |
47
+ | `databridge fuzzy <a> <b> -c <col>` | Fuzzy match columns across two files |
48
+ | `databridge diff <a> <b>` | Text diff between two files |
49
+ | `databridge drift <old> <new>` | Detect schema drift between CSVs |
50
+ | `databridge transform <file> -c <col> --op upper` | Clean a column (upper/lower/strip/trim/remove_special) |
51
+ | `databridge merge <a> <b> --keys <col>` | Merge two CSVs on key columns |
52
+ | `databridge find "*.csv"` | Find files matching a pattern |
53
+ | `databridge parse <text>` | Parse tabular data from messy text |
54
+
55
+ ## Optional Extras
56
+
57
+ ```bash
58
+ pip install 'databridge-core[fuzzy]' # Fuzzy matching (rapidfuzz)
59
+ pip install 'databridge-core[pdf]' # PDF text extraction (pypdf)
60
+ pip install 'databridge-core[ocr]' # OCR image extraction (pytesseract)
61
+ pip install 'databridge-core[sql]' # Database queries (sqlalchemy)
62
+ pip install 'databridge-core[all]' # Everything
63
+ pip install 'databridge-core[dev]' # Development tools (pytest, ruff, build)
64
+ ```
65
+
66
+ ## Built for Finance
67
+
68
+ DataBridge Core is the open-source foundation of [DataBridge AI](https://github.com/datanexum/databridge-ai) -- a full platform for financial hierarchy management, dbt model generation, and enterprise data reconciliation.
69
+
70
+ **How it works:** Upload your Chart of Accounts. Get a production-ready financial hierarchy and dbt models. Zero config.
71
+
72
+ ## License
73
+
74
+ MIT
@@ -0,0 +1,11 @@
1
+ id,name,email,city,balance
2
+ 1,Alice Johnson,alice@example.com,New York,1500.00
3
+ 2,Bob Smith,bob@example.com,Chicago,2300.50
4
+ 3,Charlie Brown,charlie@example.com,Houston,850.75
5
+ 4,Diana Prince,diana@example.com,Phoenix,3200.00
6
+ 5,Eve Williams,eve@example.com,San Antonio,1100.25
7
+ 6,Frank Castle,frank@example.com,Dallas,4500.00
8
+ 7,Grace Hopper,grace@example.com,San Jose,2750.30
9
+ 8,Hank Pym,hank@example.com,Austin,990.00
10
+ 9,Ivy League,ivy@example.com,Columbus,1800.60
11
+ 10,Jack Ryan,jack@example.com,Charlotte,3100.45
@@ -0,0 +1,10 @@
1
+ id,name,email,city,balance
2
+ 1,Alice Johnson,alice@example.com,New York,1500.00
3
+ 2,Bob Smith,bob@example.com,Chicago,2400.50
4
+ 3,Charles Brown,charlie@example.com,Houston,850.75
5
+ 4,Diana Prince,diana@example.com,Scottsdale,3200.00
6
+ 5,Eve Williams,eve@example.com,San Antonio,1100.25
7
+ 6,Frank Castle,frank@example.com,Dallas,4500.00
8
+ 7,Grace Hopper,grace@example.com,San Jose,2750.30
9
+ 8,Hank Pym,hank@example.com,Austin,990.00
10
+ 11,Kate Bishop,kate@example.com,Denver,2100.00
@@ -0,0 +1,58 @@
1
+ """DataBridge Core -- Quick demo.
2
+
3
+ Run: python examples/demo.py
4
+ """
5
+
6
+ from pathlib import Path
7
+
8
+ # Resolve example file paths
9
+ examples_dir = Path(__file__).parent
10
+ file_a = str(examples_dir / "customers_a.csv")
11
+ file_b = str(examples_dir / "customers_b.csv")
12
+
13
+
14
def main():
    """Run the three-step demo: profile, compare, then load and preview.

    Reads the module-level ``file_a`` / ``file_b`` paths, prints every result
    to stdout, and returns nothing.
    """
    # Imported inside the function, as in the original script, so that merely
    # importing this module does not import databridge_core.
    from databridge_core import compare_hashes, profile_data, load_csv

    rule = "=" * 60  # section separator, built once and reused

    # 1. Profile the source file
    print(rule)
    print("1. PROFILE SOURCE DATA")
    print(rule)
    profile = profile_data(file_a)
    print(f" File: {profile['file']}")
    print(f" Rows: {profile['rows']}, Columns: {profile['columns']}")
    print(f" Type: {profile['structure_type']}")
    print(f" Potential keys: {profile['potential_key_columns']}")
    print()

    # 2. Compare two sources
    print(rule)
    print("2. COMPARE SOURCES")
    print(rule)
    result = compare_hashes(file_a, file_b, key_columns="id")
    stats = result["statistics"]
    print(f" Source A: {result['source_a']['total_rows']} rows")
    print(f" Source B: {result['source_b']['total_rows']} rows")
    print(f" Exact matches: {stats['exact_matches']}")
    print(f" Conflicts: {stats['conflicts']}")
    print(f" Orphans in A: {stats['orphans_only_in_source_a']}")
    print(f" Orphans in B: {stats['orphans_only_in_source_b']}")
    print(f" Match rate: {stats['match_rate_percent']}%")
    print()

    # 3. Load and preview
    print(rule)
    print("3. LOAD & PREVIEW")
    print(rule)
    loaded = load_csv(file_a, preview_rows=3)
    print(f" Columns: {loaded['columns']}")
    # Plain string: there is nothing to interpolate here (ruff F541).
    print(" Preview (first 3 rows):")
    for row in loaded["preview"]:
        print(f" {row}")
    print()

    print("Done! Try the CLI: databridge profile examples/customers_a.csv")
+
56
+
57
+ if __name__ == "__main__":
58
+ main()
@@ -0,0 +1,76 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "databridge-core"
7
+ version = "0.1.0"
8
+ description = "Upload your Chart of Accounts. Get a production-ready financial hierarchy and dbt models. Zero config."
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ { name = "DataBridge AI", email = "hello@databridgeai.com" },
14
+ ]
15
+ keywords = ["data", "reconciliation", "profiling", "etl", "finance", "csv", "diff", "fuzzy-match"]
16
+ classifiers = [
17
+ "Development Status :: 4 - Beta",
18
+ "Intended Audience :: Developers",
19
+ "Intended Audience :: Financial and Insurance Industry",
20
+ "License :: OSI Approved :: MIT License",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Python :: 3.10",
23
+ "Programming Language :: Python :: 3.11",
24
+ "Programming Language :: Python :: 3.12",
25
+ "Programming Language :: Python :: 3.13",
26
+ "Topic :: Office/Business :: Financial",
27
+ "Topic :: Scientific/Engineering :: Information Analysis",
28
+ ]
29
+
30
+ dependencies = [
31
+ "pandas>=1.5",
32
+ "pydantic>=2.0",
33
+ "click>=8.0",
34
+ "rich>=13.0",
35
+ ]
36
+
37
+ [project.optional-dependencies]
38
+ fuzzy = ["rapidfuzz>=3.0"]
39
+ pdf = ["pypdf>=3.0"]
40
+ ocr = ["pytesseract>=0.3", "Pillow>=9.0"]
41
+ sql = ["sqlalchemy>=2.0"]
42
+ all = [
43
+ "rapidfuzz>=3.0",
44
+ "pypdf>=3.0",
45
+ "pytesseract>=0.3",
46
+ "Pillow>=9.0",
47
+ "sqlalchemy>=2.0",
48
+ ]
49
+ dev = [
50
+ "pytest>=7.0",
51
+ "pytest-cov>=4.0",
52
+ "ruff>=0.1",
53
+ "build>=1.0",
54
+ ]
55
+
56
+ [project.urls]
57
+ Homepage = "https://github.com/datanexum/databridge-core"
58
+ Documentation = "https://github.com/datanexum/databridge-core#readme"
59
+ Repository = "https://github.com/datanexum/databridge-core"
60
+ Issues = "https://github.com/datanexum/databridge-core/issues"
61
+
62
+ [project.scripts]
63
+ databridge = "databridge_core.cli:cli"
64
+
65
+ [tool.hatch.build.targets.wheel]
66
+ packages = ["src/databridge_core"]
67
+
68
+ [tool.ruff]
69
+ target-version = "py310"
70
+ line-length = 100
71
+
72
+ [tool.ruff.lint]
73
+ select = ["E", "F", "I", "W"]
74
+
75
+ [tool.pytest.ini_options]
76
+ testpaths = ["tests"]
@@ -0,0 +1,72 @@
1
+ """DataBridge Core -- Data reconciliation, profiling, and ingestion toolkit.
2
+
3
+ Upload your Chart of Accounts. Get a production-ready financial hierarchy
4
+ and dbt models. Zero config.
5
+
6
+ Quick start::
7
+
8
+ from databridge_core import compare_hashes, profile_data, load_csv
9
+
10
+ result = profile_data("sales.csv")
11
+ print(result["rows"], "rows,", result["columns"], "columns")
12
+
13
+ comparison = compare_hashes("source.csv", "target.csv", key_columns="id")
14
+ print(comparison["statistics"]["match_rate_percent"], "% match rate")
15
+ """
16
+
17
+ __version__ = "0.1.0"
18
+
19
+ # Reconciler
20
+ from .reconciler import (
21
+ compare_hashes,
22
+ get_orphan_details,
23
+ get_conflict_details,
24
+ fuzzy_match_columns,
25
+ fuzzy_deduplicate,
26
+ merge_sources,
27
+ compute_similarity,
28
+ diff_lists,
29
+ diff_dicts,
30
+ explain_diff,
31
+ find_close_matches,
32
+ find_similar_strings,
33
+ transform_column,
34
+ )
35
+
36
+ # Profiler
37
+ from .profiler import profile_data, detect_schema_drift
38
+
39
+ # Ingestion
40
+ from .ingestion import load_csv, load_json, extract_pdf_text, parse_table_from_text
41
+
42
+ # Files
43
+ from .files import find_files, stage_file
44
+
45
# Public API of the package. Kept as a literal list so that linters and type
# checkers can recognise the re-exports; grouped by originating subpackage.
__all__ = [
    "__version__",
    # Reconciler
    "compare_hashes", "get_orphan_details", "get_conflict_details",
    "fuzzy_match_columns", "fuzzy_deduplicate",
    "merge_sources",
    "compute_similarity", "diff_lists", "diff_dicts", "explain_diff",
    "find_close_matches", "find_similar_strings",
    "transform_column",
    # Profiler
    "profile_data", "detect_schema_drift",
    # Ingestion
    "load_csv", "load_json", "extract_pdf_text", "parse_table_from_text",
    # Files
    "find_files", "stage_file",
]
@@ -0,0 +1,8 @@
1
+ """Shared I/O helpers."""
2
+
3
+ import pandas as pd
4
+
5
+
6
def read_csv(file_path: str) -> pd.DataFrame:
    """Load the CSV at ``file_path`` using pandas' default parsing options.

    Args:
        file_path: Location of the CSV file, passed straight to pandas.

    Returns:
        The parsed contents as a ``pandas.DataFrame``.
    """
    frame = pd.read_csv(file_path)
    return frame
@@ -0,0 +1,125 @@
1
+ """Shared result types for the databridge-core library.
2
+
3
+ All library functions return Python objects (dicts, dataclasses, Pydantic models).
4
+ """
5
+
6
+ from typing import Any, Dict, List, Optional
7
+
8
+ from pydantic import BaseModel, Field
9
+
10
+
11
+ # -- Profiler types --
12
+
13
class ProfileResult(BaseModel):
    """Pydantic model for the outcome of profiling a single data source.

    All ten fields are required.

    NOTE(review): the field names mirror keys that examples/demo.py reads
    from ``profile_data`` output -- confirm that function produces this shape.
    """

    file: str
    rows: int
    columns: int
    structure_type: str
    # Column name -> type label, both strings.
    column_types: dict[str, str]
    # Three lists of column names.
    potential_key_columns: list[str]
    high_cardinality_cols: list[str]
    low_cardinality_cols: list[str]
    # Free-form mappings; value types are not constrained.
    data_quality: dict[str, Any]
    statistics: dict[str, Any]
25
+
26
+
27
class DriftResult(BaseModel):
    """Pydantic model for the outcome of a schema drift check.

    All seven fields are required.
    """

    # String labels for the two sources; presumably file paths -- confirm.
    source_a: str
    source_b: str
    # Lists of column names.
    columns_added: list[str]
    columns_removed: list[str]
    columns_common: list[str]
    # Keyed by string; each value is itself a free-form mapping.
    type_changes: dict[str, dict[str, Any]]
    has_drift: bool
36
+
37
+
38
+ # -- Reconciler types --
39
+
40
class CompareHashesResult(BaseModel):
    """Pydantic model for the outcome of a hash-based row comparison.

    All five fields are required.
    """

    # Free-form per-source mappings; value types are not constrained.
    source_a: dict[str, Any]
    source_b: dict[str, Any]
    # Lists of column names.
    key_columns: list[str]
    compare_columns: list[str]
    # Free-form mapping of summary figures.
    statistics: dict[str, Any]
47
+
48
+
49
class OrphanResult(BaseModel):
    """Pydantic model for the outcome of an orphan-record lookup.

    ``orphan_source`` is required; the two per-source mappings are optional
    and default to ``None``.
    """

    orphan_source: str
    orphans_in_a: dict[str, Any] | None = Field(default=None)
    orphans_in_b: dict[str, Any] | None = Field(default=None)
54
+
55
+
56
class ConflictResult(BaseModel):
    """Pydantic model for the outcome of a conflict-detail lookup.

    All three fields are required.
    """

    total_conflicts: int
    # presumably the number of entries included in ``conflicts`` -- confirm
    showing: int
    # One free-form mapping per conflict.
    conflicts: list[dict[str, Any]]
61
+
62
+
63
class FuzzyMatchResult(BaseModel):
    """Pydantic model for the outcome of fuzzy column matching.

    All five fields are required.
    """

    column_a: str
    column_b: str
    # Integer threshold; the README's CLI example passes ``--threshold 80``.
    threshold: int
    total_matches: int
    # One free-form mapping per reported match.
    top_matches: list[dict[str, Any]]
70
+
71
+
72
class MergeResult(BaseModel):
    """Pydantic model for the outcome of merging two sources.

    All six fields are required.
    """

    # Integer row counts.
    source_a_rows: int
    source_b_rows: int
    merged_rows: int
    merge_type: str
    # List of column names.
    columns: list[str]
    # One free-form mapping per preview row.
    preview: list[dict[str, Any]]
80
+
81
+
82
+ # -- Ingestion types --
83
+
84
class LoadResult(BaseModel):
    """Pydantic model for the outcome of loading a file.

    ``file``, ``rows``, ``columns`` and ``preview`` are required; ``dtypes``
    and ``null_counts`` are optional and default to ``None``.
    """

    file: str
    rows: int
    # List of column names.
    columns: list[str]
    # One free-form mapping per preview row.
    preview: list[dict[str, Any]]
    dtypes: dict[str, str] | None = Field(default=None)
    null_counts: dict[str, int] | None = Field(default=None)
92
+
93
+
94
class PdfExtractResult(BaseModel):
    """Pydantic model for the outcome of PDF text extraction.

    All four fields are required.
    """

    file: str
    # Integer page counts.
    total_pages: int
    pages_extracted: int
    # List of free-form mappings; presumably one per extracted page -- confirm.
    content: list[dict[str, Any]]
100
+
101
+
102
class OcrResult(BaseModel):
    """Pydantic model for the outcome of OCR text extraction.

    All four fields are required.
    """

    file: str
    language: str
    text: str
    # presumably ``len(text)`` -- confirm against the OCR routine
    character_count: int
108
+
109
+
110
class TableParseResult(BaseModel):
    """Pydantic model for the outcome of parsing tabular data from text.

    Every field is optional and defaults to ``None``.
    """

    columns: list[str] | None = Field(default=None)
    row_count: int | None = Field(default=None)
    preview: list[dict[str, Any]] | None = Field(default=None)
    raw_row: list[str] | None = Field(default=None)
116
+
117
+
118
class QueryResult(BaseModel):
    """Pydantic model for the outcome of a database query.

    ``rows_returned``, ``columns`` and ``preview`` are required. ``dtypes``
    and ``sql`` default to ``None``; ``truncated`` defaults to ``False``.
    """

    rows_returned: int
    # List of column names.
    columns: list[str]
    dtypes: dict[str, str] | None = Field(default=None)
    # One free-form mapping per preview row.
    preview: list[dict[str, Any]]
    truncated: bool = Field(default=False)
    sql: str | None = Field(default=None)