boti-data 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. boti_data-0.1.0/PKG-INFO +189 -0
  2. boti_data-0.1.0/README.md +160 -0
  3. boti_data-0.1.0/pyproject.toml +64 -0
  4. boti_data-0.1.0/setup.cfg +4 -0
  5. boti_data-0.1.0/src/boti_data/__init__.py +92 -0
  6. boti_data-0.1.0/src/boti_data/arrow_schema.py +326 -0
  7. boti_data-0.1.0/src/boti_data/connection_catalog.py +121 -0
  8. boti_data-0.1.0/src/boti_data/db/__init__.py +42 -0
  9. boti_data-0.1.0/src/boti_data/db/arrow_schema_mapper.py +331 -0
  10. boti_data-0.1.0/src/boti_data/db/engine_registry.py +113 -0
  11. boti_data-0.1.0/src/boti_data/db/partitioned_execution.py +333 -0
  12. boti_data-0.1.0/src/boti_data/db/partitioned_loader.py +169 -0
  13. boti_data-0.1.0/src/boti_data/db/partitioned_planner.py +429 -0
  14. boti_data-0.1.0/src/boti_data/db/partitioned_types.py +123 -0
  15. boti_data-0.1.0/src/boti_data/db/sql_config.py +154 -0
  16. boti_data-0.1.0/src/boti_data/db/sql_engine.py +330 -0
  17. boti_data-0.1.0/src/boti_data/db/sql_manager.py +43 -0
  18. boti_data-0.1.0/src/boti_data/db/sql_model_builder.py +79 -0
  19. boti_data-0.1.0/src/boti_data/db/sql_model_registry.py +384 -0
  20. boti_data-0.1.0/src/boti_data/db/sql_readonly.py +76 -0
  21. boti_data-0.1.0/src/boti_data/db/sql_resource.py +180 -0
  22. boti_data-0.1.0/src/boti_data/db/sqlalchemy_async.py +19 -0
  23. boti_data-0.1.0/src/boti_data/distributed.py +182 -0
  24. boti_data-0.1.0/src/boti_data/field_map.py +177 -0
  25. boti_data-0.1.0/src/boti_data/filters/__init__.py +25 -0
  26. boti_data-0.1.0/src/boti_data/filters/arrow_kernels.py +351 -0
  27. boti_data-0.1.0/src/boti_data/filters/expressions.py +149 -0
  28. boti_data-0.1.0/src/boti_data/filters/handler.py +268 -0
  29. boti_data-0.1.0/src/boti_data/filters/utils.py +449 -0
  30. boti_data-0.1.0/src/boti_data/gateway/__init__.py +13 -0
  31. boti_data-0.1.0/src/boti_data/gateway/arrow_adapters.py +337 -0
  32. boti_data-0.1.0/src/boti_data/gateway/core.py +2068 -0
  33. boti_data-0.1.0/src/boti_data/gateway/frame_strategies.py +385 -0
  34. boti_data-0.1.0/src/boti_data/gateway/loaders.py +284 -0
  35. boti_data-0.1.0/src/boti_data/gateway/normalization.py +182 -0
  36. boti_data-0.1.0/src/boti_data/gateway/requests.py +150 -0
  37. boti_data-0.1.0/src/boti_data/helper.py +199 -0
  38. boti_data-0.1.0/src/boti_data/joins.py +147 -0
  39. boti_data-0.1.0/src/boti_data/parquet/__init__.py +8 -0
  40. boti_data-0.1.0/src/boti_data/parquet/reader.py +190 -0
  41. boti_data-0.1.0/src/boti_data/parquet/resource.py +572 -0
  42. boti_data-0.1.0/src/boti_data/schema.py +245 -0
  43. boti_data-0.1.0/src/boti_data.egg-info/PKG-INFO +189 -0
  44. boti_data-0.1.0/src/boti_data.egg-info/SOURCES.txt +45 -0
  45. boti_data-0.1.0/src/boti_data.egg-info/dependency_links.txt +1 -0
  46. boti_data-0.1.0/src/boti_data.egg-info/requires.txt +10 -0
  47. boti_data-0.1.0/src/boti_data.egg-info/top_level.txt +1 -0
@@ -0,0 +1,189 @@
1
+ Metadata-Version: 2.4
2
+ Name: boti-data
3
+ Version: 0.1.0
4
+ Summary: Data infrastructure for the Boti ecosystem
5
+ Author-email: Your Name <your.email@example.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/your-username/boti-data
8
+ Project-URL: Repository, https://github.com/your-username/boti-data
9
+ Project-URL: Documentation, https://github.com/your-username/boti-data#readme
10
+ Project-URL: Issues, https://github.com/your-username/boti-data/issues
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
17
+ Requires-Python: >=3.13
18
+ Description-Content-Type: text/markdown
19
+ Requires-Dist: asyncmy>=0.2.11
20
+ Requires-Dist: boti<0.2.0,>=0.1.0
21
+ Requires-Dist: dask[dataframe,distributed]>=2026.3.0
22
+ Requires-Dist: fsspec>=2026.3.0
23
+ Requires-Dist: pandas>=3.0.2
24
+ Requires-Dist: polars>=1.29.0
25
+ Requires-Dist: pyarrow>=23.0.1
26
+ Requires-Dist: pydantic>=2.12.5
27
+ Requires-Dist: pymysql>=1.1.2
28
+ Requires-Dist: sqlalchemy[asyncio]>=2.0.49
29
+
30
+ # boti-data
31
+
32
+ `boti-data` is the **data access and data transformation layer** of the Boti ecosystem.
33
+
34
+ It builds on top of `boti` and gives teams a reusable interface for working with structured data across databases, parquet datasets, schema-controlled transformations, and distributed or partitioned loading workflows.
35
+
36
+ ## What `boti-data` is for
37
+
38
+ Many teams have the same recurring problem: business logic depends on data that lives in multiple places, arrives in slightly different shapes, and is loaded through a mix of notebooks, scripts, ad hoc SQL, and one-off helpers.
39
+
40
+ `boti-data` helps turn that into a more coherent data access layer.
41
+
42
+ It is designed for codebases that need to:
43
+
44
+ - connect to named data sources consistently
45
+ - reflect or model database tables without hand-writing everything up front
46
+ - load data through a gateway instead of bespoke query snippets everywhere
47
+ - normalise and validate schemas before downstream use
48
+ - combine parquet and database workflows in one library
49
+ - scale from simple local reads to partitioned or distributed loading
50
+
51
+ ## Problems `boti-data` solves
52
+
53
+ `boti-data` is useful when data code is suffering from issues like:
54
+
55
+ - repeated connection boilerplate across notebooks and services
56
+ - slow, fragile query code copied from place to place
57
+ - inconsistent schema assumptions between producers and consumers
58
+ - difficult transitions from exploratory analysis to reusable pipelines
59
+ - manual join and field-mapping logic repeated in many modules
60
+ - no common abstraction for loading data from SQL and parquet sources
61
+
62
+ By centralising those patterns, `boti-data` reduces duplicated plumbing and makes transformations easier to reason about.
63
+
64
+ ## Why `boti-data` can make a huge difference
65
+
66
+ The biggest benefit of `boti-data` is that it creates a **shared data interface** between infrastructure and business logic.
67
+
68
+ That means teams can spend less time rewriting access code and more time working on actual transformations, validation rules, and downstream decisions.
69
+
70
+ It can make a major difference when:
71
+
72
+ - analysts and engineers share the same source systems
73
+ - a notebook prototype needs to become production code
74
+ - multiple data products depend on the same tables or parquet layouts
75
+ - schema drift is a recurring source of errors
76
+ - large extracts need partitioning or distributed execution
77
+ - teams want a clean boundary between connection details and transformation logic
78
+
79
+ ## Domain areas where it is especially valuable
80
+
81
+ `boti-data` is intentionally general-purpose, but it is especially strong in domains where structured operational data must be transformed into reliable analytical or decision-ready datasets.
82
+
83
+ Examples include:
84
+
85
+ - **analytics engineering**: building reusable source loaders, schema maps, and standardised transformations
86
+ - **business operations**: consolidating data from transactional systems, planning tools, and operational databases
87
+ - **finance and controlling**: reconciling structured data with explicit schema expectations and repeatable joins
88
+ - **risk, compliance, and audit**: validating input shape, tracing transformations, and standardising access patterns
89
+ - **customer and product analytics**: joining behavioural and operational datasets with less custom plumbing
90
+ - **supply chain and logistics**: unifying inventory, movement, order, and status data from several systems
91
+ - **data platform and internal tooling**: giving teams a common gateway layer instead of ad hoc connectors
92
+ - **ML feature preparation**: building reliable dataset assembly steps from SQL and parquet sources
93
+
94
+ In those settings, the gains are not just convenience. They show up as better reuse, fewer integration bugs, and faster movement from exploration to production.
95
+
96
+ ## Core capabilities
97
+
98
+ - SQL database resources
99
+ - async and sync database access helpers
100
+ - SQLAlchemy model reflection and registries
101
+ - connection catalogues
102
+ - parquet resources and readers
103
+ - gateway-style loading APIs
104
+ - filter expressions
105
+ - schema normalisation and validation helpers
106
+ - field mapping and join helpers
107
+ - partitioned and distributed data workflows
108
+
109
+ ## Installation
110
+
111
+ Install directly:
112
+
113
+ ```bash
114
+ pip install boti-data
115
+ ```
116
+
117
+ Or install through the core package extra:
118
+
119
+ ```bash
120
+ pip install "boti[data]"
121
+ ```
122
+
123
+ ## Imports
124
+
125
+ `boti-data` uses the top-level Python package `boti_data`:
126
+
127
+ ```python
128
+ from boti_data import (
129
+ ConnectionCatalog,
130
+ DataGateway,
131
+ DataHelper,
132
+ FieldMap,
133
+ ParquetDataConfig,
134
+ ParquetDataResource,
135
+ SqlAlchemyModelBuilder,
136
+ SqlDatabaseConfig,
137
+ SqlDatabaseResource,
138
+ )
139
+ ```
140
+
141
+ Lower-level modules are also available:
142
+
143
+ ```python
144
+ from boti_data.db import SqlDatabaseConfig, SqlDatabaseResource
145
+ from boti_data.gateway import DataGateway
146
+ from boti_data.parquet import ParquetDataConfig, ParquetDataResource
147
+ from boti_data.schema import validate_schema
148
+ ```
149
+
150
+ ## Examples
151
+
152
+ ### SQL resource
153
+
154
+ ```python
155
+ from boti_data import SqlDatabaseConfig, SqlDatabaseResource
156
+
157
+ config = SqlDatabaseConfig(connection_url="sqlite:///example.db", query_only=True)
158
+
159
+ with SqlDatabaseResource(config) as db:
160
+ with db.session() as session:
161
+ rows = session.execute(...)
162
+ ```
163
+
164
+ ### Gateway
165
+
166
+ ```python
167
+ from boti_data import DataGateway, SqlDatabaseConfig
168
+
169
+ gateway = DataGateway(
170
+ backend="sqlalchemy",
171
+ config=SqlDatabaseConfig(connection_url="sqlite:///example.db", query_only=True),
172
+ )
173
+ ```
174
+
175
+ ## Relationship to `boti`
176
+
177
+ `boti-data` depends on `boti`, and reuses:
178
+
179
+ - logging
180
+ - resource lifecycle
181
+ - secure I/O helpers
182
+ - project/environment utilities
183
+
184
+ If you only need the runtime primitives, install `boti`.
185
+ If you need a stronger data access and transformation layer, install `boti-data` or `boti[data]`.
186
+
187
+ ## Development & Deployment
188
+
189
+ See [docs/DEPLOYMENT.md](docs/DEPLOYMENT.md) for publishing instructions.
@@ -0,0 +1,160 @@
1
+ # boti-data
2
+
3
+ `boti-data` is the **data access and data transformation layer** of the Boti ecosystem.
4
+
5
+ It builds on top of `boti` and gives teams a reusable interface for working with structured data across databases, parquet datasets, schema-controlled transformations, and distributed or partitioned loading workflows.
6
+
7
+ ## What `boti-data` is for
8
+
9
+ Many teams have the same recurring problem: business logic depends on data that lives in multiple places, arrives in slightly different shapes, and is loaded through a mix of notebooks, scripts, ad hoc SQL, and one-off helpers.
10
+
11
+ `boti-data` helps turn that into a more coherent data access layer.
12
+
13
+ It is designed for codebases that need to:
14
+
15
+ - connect to named data sources consistently
16
+ - reflect or model database tables without hand-writing everything up front
17
+ - load data through a gateway instead of bespoke query snippets everywhere
18
+ - normalise and validate schemas before downstream use
19
+ - combine parquet and database workflows in one library
20
+ - scale from simple local reads to partitioned or distributed loading
21
+
22
+ ## Problems `boti-data` solves
23
+
24
+ `boti-data` is useful when data code is suffering from issues like:
25
+
26
+ - repeated connection boilerplate across notebooks and services
27
+ - slow, fragile query code copied from place to place
28
+ - inconsistent schema assumptions between producers and consumers
29
+ - difficult transitions from exploratory analysis to reusable pipelines
30
+ - manual join and field-mapping logic repeated in many modules
31
+ - no common abstraction for loading data from SQL and parquet sources
32
+
33
+ By centralising those patterns, `boti-data` reduces duplicated plumbing and makes transformations easier to reason about.
34
+
35
+ ## Why `boti-data` can make a huge difference
36
+
37
+ The biggest benefit of `boti-data` is that it creates a **shared data interface** between infrastructure and business logic.
38
+
39
+ That means teams can spend less time rewriting access code and more time working on actual transformations, validation rules, and downstream decisions.
40
+
41
+ It can make a major difference when:
42
+
43
+ - analysts and engineers share the same source systems
44
+ - a notebook prototype needs to become production code
45
+ - multiple data products depend on the same tables or parquet layouts
46
+ - schema drift is a recurring source of errors
47
+ - large extracts need partitioning or distributed execution
48
+ - teams want a clean boundary between connection details and transformation logic
49
+
50
+ ## Domain areas where it is especially valuable
51
+
52
+ `boti-data` is intentionally general-purpose, but it is especially strong in domains where structured operational data must be transformed into reliable analytical or decision-ready datasets.
53
+
54
+ Examples include:
55
+
56
+ - **analytics engineering**: building reusable source loaders, schema maps, and standardised transformations
57
+ - **business operations**: consolidating data from transactional systems, planning tools, and operational databases
58
+ - **finance and controlling**: reconciling structured data with explicit schema expectations and repeatable joins
59
+ - **risk, compliance, and audit**: validating input shape, tracing transformations, and standardising access patterns
60
+ - **customer and product analytics**: joining behavioural and operational datasets with less custom plumbing
61
+ - **supply chain and logistics**: unifying inventory, movement, order, and status data from several systems
62
+ - **data platform and internal tooling**: giving teams a common gateway layer instead of ad hoc connectors
63
+ - **ML feature preparation**: building reliable dataset assembly steps from SQL and parquet sources
64
+
65
+ In those settings, the gains are not just convenience. They show up as better reuse, fewer integration bugs, and faster movement from exploration to production.
66
+
67
+ ## Core capabilities
68
+
69
+ - SQL database resources
70
+ - async and sync database access helpers
71
+ - SQLAlchemy model reflection and registries
72
+ - connection catalogues
73
+ - parquet resources and readers
74
+ - gateway-style loading APIs
75
+ - filter expressions
76
+ - schema normalisation and validation helpers
77
+ - field mapping and join helpers
78
+ - partitioned and distributed data workflows
79
+
80
+ ## Installation
81
+
82
+ Install directly:
83
+
84
+ ```bash
85
+ pip install boti-data
86
+ ```
87
+
88
+ Or install through the core package extra:
89
+
90
+ ```bash
91
+ pip install "boti[data]"
92
+ ```
93
+
94
+ ## Imports
95
+
96
+ `boti-data` uses the top-level Python package `boti_data`:
97
+
98
+ ```python
99
+ from boti_data import (
100
+ ConnectionCatalog,
101
+ DataGateway,
102
+ DataHelper,
103
+ FieldMap,
104
+ ParquetDataConfig,
105
+ ParquetDataResource,
106
+ SqlAlchemyModelBuilder,
107
+ SqlDatabaseConfig,
108
+ SqlDatabaseResource,
109
+ )
110
+ ```
111
+
112
+ Lower-level modules are also available:
113
+
114
+ ```python
115
+ from boti_data.db import SqlDatabaseConfig, SqlDatabaseResource
116
+ from boti_data.gateway import DataGateway
117
+ from boti_data.parquet import ParquetDataConfig, ParquetDataResource
118
+ from boti_data.schema import validate_schema
119
+ ```
120
+
121
+ ## Examples
122
+
123
+ ### SQL resource
124
+
125
+ ```python
126
+ from boti_data import SqlDatabaseConfig, SqlDatabaseResource
127
+
128
+ config = SqlDatabaseConfig(connection_url="sqlite:///example.db", query_only=True)
129
+
130
+ with SqlDatabaseResource(config) as db:
131
+ with db.session() as session:
132
+ rows = session.execute(...)
133
+ ```
134
+
135
+ ### Gateway
136
+
137
+ ```python
138
+ from boti_data import DataGateway, SqlDatabaseConfig
139
+
140
+ gateway = DataGateway(
141
+ backend="sqlalchemy",
142
+ config=SqlDatabaseConfig(connection_url="sqlite:///example.db", query_only=True),
143
+ )
144
+ ```
145
+
146
+ ## Relationship to `boti`
147
+
148
+ `boti-data` depends on `boti`, and reuses:
149
+
150
+ - logging
151
+ - resource lifecycle
152
+ - secure I/O helpers
153
+ - project/environment utilities
154
+
155
+ If you only need the runtime primitives, install `boti`.
156
+ If you need a stronger data access and transformation layer, install `boti-data` or `boti[data]`.
157
+
158
+ ## Development & Deployment
159
+
160
+ See [docs/DEPLOYMENT.md](docs/DEPLOYMENT.md) for publishing instructions.
@@ -0,0 +1,64 @@
1
+ [project]
2
+ name = "boti-data"
3
+ version = "0.1.0"
4
+ description = "Data infrastructure for the Boti ecosystem"
5
+ readme = "README.md"
6
+ requires-python = ">=3.13"
7
+ license = {text = "MIT"}
8
+ authors = [
9
+ {name = "Your Name", email = "your.email@example.com"}
10
+ ]
11
+ classifiers = [
12
+ "Development Status :: 3 - Alpha",
13
+ "Intended Audience :: Developers",
14
+ "License :: OSI Approved :: MIT License",
15
+ "Programming Language :: Python :: 3",
16
+ "Programming Language :: Python :: 3.13",
17
+ "Topic :: Software Development :: Libraries :: Python Modules",
18
+ ]
19
+ dependencies = [
20
+ "asyncmy>=0.2.11",
21
+ "boti>=0.1.0,<0.2.0",
22
+ "dask[dataframe,distributed]>=2026.3.0",
23
+ "fsspec>=2026.3.0",
24
+ "pandas>=3.0.2",
25
+ "polars>=1.29.0",
26
+ "pyarrow>=23.0.1",
27
+ "pydantic>=2.12.5",
28
+ "pymysql>=1.1.2",
29
+ "sqlalchemy[asyncio]>=2.0.49",
30
+ ]
31
+
32
+ [project.urls]
33
+ Homepage = "https://github.com/your-username/boti-data"
34
+ Repository = "https://github.com/your-username/boti-data"
35
+ Documentation = "https://github.com/your-username/boti-data#readme"
36
+ Issues = "https://github.com/your-username/boti-data/issues"
37
+
38
+ [build-system]
39
+ requires = ["setuptools>=80", "wheel"]
40
+ build-backend = "setuptools.build_meta"
41
+
42
+ [dependency-groups]
43
+ dev = [
44
+ "pytest>=9.0.3",
45
+ "pytest-asyncio>=1.3.0",
46
+ ]
47
+
48
+ [tool.uv]
49
+ publish.token = {env = "UV_PUBLISH_TOKEN"}
50
+
51
+ [tool.pytest.ini_options]
52
+ markers = [
53
+ "security_regression: focused regression coverage for security fixes and audit findings",
54
+ ]
55
+
56
+ [tool.setuptools]
57
+ include-package-data = false
58
+
59
+ [tool.setuptools.package-dir]
60
+ "" = "src"
61
+
62
+ [tool.setuptools.packages.find]
63
+ where = ["src"]
64
+ include = ["boti_data*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,92 @@
"""
Data modules and interfaces for the Boti pipeline context.

This module re-exports the public API of the ``boti_data`` subpackages
(``db``, ``filters``, ``gateway``, ``parquet``, ``schema``, and the
top-level helper modules) so callers can import everything from the
top-level package::

    from boti_data import DataGateway, SqlDatabaseConfig
"""

# Imports are grouped per source module and sorted alphabetically by
# module path (PEP 8); ``boti_data.gateway`` is imported exactly once.
from boti_data.connection_catalog import ConnectionCatalog
from boti_data.db import (
    AsyncSqlDatabaseResource,
    BuilderConfig,
    DefaultBase,
    EngineRegistry,
    RegistryConfig,
    SqlAlchemyModelBuilder,
    SqlDatabaseConfig,
    SqlDatabaseResource,
    SqlModelRegistry,
    SqlPartitionPlan,
    SqlPartitionSpec,
    SqlPartitionedLoadRequest,
    SqlPartitionedLoader,
    ensure_greenlet_available,
    get_global_registry,
)
from boti_data.distributed import DaskSession, dask_session
from boti_data.field_map import FieldMap
from boti_data.filters import And, Expr, FilterHandler, Not, Or, TrueExpr
from boti_data.gateway import (
    DataFrameOptions,
    DataFrameParams,
    DataGateway,
    ParquetLoadRequest,
    SqlLoadRequest,
)
from boti_data.helper import DataHelper
from boti_data.joins import indexed_left_join, left_join_frames
from boti_data.parquet import ParquetDataConfig, ParquetDataResource, ParquetReader
from boti_data.schema import (
    SchemaValidationError,
    align_frames_for_join,
    apply_schema_map,
    infer_schema_map,
    normalize_dtype_alias,
    normalize_schema_map,
    validate_schema,
)

# Public API of the package, sorted with plain string ordering
# (uppercase names before lowercase names).
__all__ = [
    "And",
    "AsyncSqlDatabaseResource",
    "BuilderConfig",
    "ConnectionCatalog",
    "DaskSession",
    "DataFrameOptions",
    "DataFrameParams",
    "DataGateway",
    "DataHelper",
    "DefaultBase",
    "EngineRegistry",
    "Expr",
    "FieldMap",
    "FilterHandler",
    "Not",
    "Or",
    "ParquetDataConfig",
    "ParquetDataResource",
    "ParquetLoadRequest",
    "ParquetReader",
    "RegistryConfig",
    "SchemaValidationError",
    "SqlAlchemyModelBuilder",
    "SqlDatabaseConfig",
    "SqlDatabaseResource",
    "SqlLoadRequest",
    "SqlModelRegistry",
    "SqlPartitionPlan",
    "SqlPartitionSpec",
    "SqlPartitionedLoadRequest",
    "SqlPartitionedLoader",
    "TrueExpr",
    "align_frames_for_join",
    "apply_schema_map",
    "dask_session",
    "ensure_greenlet_available",
    "get_global_registry",
    "indexed_left_join",
    "infer_schema_map",
    "left_join_frames",
    "normalize_dtype_alias",
    "normalize_schema_map",
    "validate_schema",
]