datablade 0.0.5-py3-none-any.whl → 0.0.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datablade/__init__.py +10 -2
- datablade/blade.py +174 -5
- datablade/dataframes/__init__.py +8 -0
- datablade/dataframes/frames.py +127 -27
- datablade/dataframes/readers.py +988 -161
- datablade/docs/ARCHITECTURE.md +102 -0
- datablade/docs/OBJECT_REGISTRY.md +194 -0
- datablade/docs/README.md +57 -0
- datablade/docs/TESTING.md +37 -0
- datablade/docs/USAGE.md +409 -0
- datablade/docs/__init__.py +87 -0
- datablade/docs/__main__.py +6 -0
- datablade/io/json.py +45 -8
- datablade/io/zip.py +68 -30
- datablade/registry.py +581 -0
- datablade/sql/__init__.py +25 -1
- datablade/sql/bulk_load.py +309 -49
- datablade/sql/ddl.py +201 -26
- datablade/sql/ddl_pyarrow.py +150 -26
- datablade/sql/dialects.py +2 -0
- datablade/sql/quoting.py +2 -0
- datablade/sql/schema_spec.py +65 -0
- datablade/sql/sqlserver.py +390 -0
- datablade/utils/__init__.py +2 -1
- datablade/utils/lists.py +3 -0
- datablade/utils/logging.py +46 -1
- datablade/utils/strings.py +180 -17
- {datablade-0.0.5.dist-info → datablade-0.0.6.dist-info}/METADATA +68 -13
- datablade-0.0.6.dist-info/RECORD +41 -0
- {datablade-0.0.5.dist-info → datablade-0.0.6.dist-info}/WHEEL +1 -1
- datablade-0.0.5.dist-info/RECORD +0 -31
- {datablade-0.0.5.dist-info → datablade-0.0.6.dist-info}/licenses/LICENSE +0 -0
- {datablade-0.0.5.dist-info → datablade-0.0.6.dist-info}/top_level.txt +0 -0
datablade/docs/USAGE.md
ADDED

# datablade Usage Guide

## Installation

```bash
pip install datablade
```

Optional extras:

```bash
pip install "datablade[performance]"
```

## File reading

### File format support matrix

| File type | Streaming (iterator / Parquet streaming) | Materialized (read_file_smart) | Engine / SQL notes |
| --- | --- | --- | --- |
| `.csv`, `.tsv`, `.txt` | ✅ | ✅ | pandas by default; Polars `return_type="polars"` or `"polars_lazy"` (scan for large inputs) |
| `.parquet` | ✅ | ✅ | pandas by default; Polars `return_type="polars"` or `"polars_lazy"`; SQL DDL from schema via `generate_create_table_from_parquet` |
| JSON Lines (`.json` with `lines=True` or `.jsonl`) | ✅ | ✅ | pandas by default; Polars via `scan_ndjson` when `lines=True` |
| JSON (non-lines) | ❌ | ✅ | pandas only (materialized) |
| `.xlsx`, `.xls` | ✅ (read-only, `openpyxl`) | ✅ | pandas only |

Notes:

- **JSON Lines requirement:** streaming JSON requires JSON Lines (`lines=True`). Non-lines JSON only materializes.
- **Excel limitations:** streaming Excel relies on `openpyxl` read-only mode; without it, Excel files fall back to full reads and must fit in `memory_fraction`.
- **Polars return options:** use `return_type="polars"` for a Polars `DataFrame`, and `return_type="polars_lazy"` for a `LazyFrame` when scanning is supported.

### Read into a DataFrame (convenience)

Use this when you want a single in-memory DataFrame.

```python
from datablade.dataframes import read_file_smart

df = read_file_smart("data.csv", verbose=True)
```

`read_file_smart()` may read in chunks internally, but it returns a single DataFrame by default.
If you want a non-materializing return type, set `return_type`:

```python
from datablade.dataframes import read_file_smart

# Iterator of pandas DataFrames (no concat)
chunks = read_file_smart("huge.csv", return_type="iterator", memory_fraction=0.3)
for chunk in chunks:
    process(chunk)

# Polars for large files (LazyFrame for large inputs, DataFrame for small ones)
polars_obj = read_file_smart("huge.csv", return_type="polars")
```

#### Polars return types

Use `return_type="polars"` (or `return_polars=True`) to get a Polars `DataFrame`. Use
`return_type="polars_lazy"` for a lazy `LazyFrame` when the format supports Polars
scanning.

```python
from datablade.dataframes import read_file_smart

df_polars = read_file_smart("large.csv", return_type="polars", verbose=True)
lf_polars = read_file_smart("large.csv", return_type="polars_lazy", verbose=True)
```

Notes:

- For CSV/TSV/TXT and Parquet files, Polars uses `scan_*` for large inputs.
- When a format is not supported by Polars scanning, datablade reads with pandas and
  converts to Polars.
- For very large files, the pandas chunk-concat fallback can temporarily spike memory
  usage. Use `read_file_iter()` or `return_type="iterator"` to avoid concatenation.

### Stream without materializing (recommended for very large files)

Use this when you want to process arbitrarily large files without ever concatenating the full dataset.

```python
from datablade.dataframes import read_file_iter

for chunk in read_file_iter("huge.csv", memory_fraction=0.3, verbose=True):
    process(chunk)
```

If you already use `read_file_smart()`, you can request the same behavior:

```python
from datablade.dataframes import read_file_smart

for chunk in read_file_smart("huge.csv", return_type="iterator"):
    process(chunk)
```

Supported streaming formats:

- `.csv`, `.tsv`, `.txt`
- `.parquet`
- `.json` (JSON Lines with `lines=True`, or standard JSON arrays with `ijson`)
- `.xlsx`, `.xls` when `openpyxl` is installed (read-only streaming)

JSON streaming notes:

- Standard JSON streaming requires the optional `ijson` dependency.
- If your JSON is nested, pass `record_path="item"` (or another path) to
  `read_file_iter()` or `json_to_jsonl()`.
- Use `json_to_jsonl()` to convert a standard JSON file into JSON Lines when
  `lines=True` streaming is needed (see the sketch after this list).
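
A minimal sketch of the two JSON paths above. The `record_path` value and the `json_to_jsonl()` output-path argument are illustrative assumptions; check the signatures in `datablade.dataframes` before relying on them.

```python
from datablade.dataframes import json_to_jsonl, read_file_iter

# Stream records nested under "item" in a standard JSON array (requires ijson).
for chunk in read_file_iter("nested.json", record_path="item", memory_fraction=0.3):
    process(chunk)

# Or convert standard JSON to JSON Lines first (output-path argument assumed),
# then stream the JSON Lines file.
json_to_jsonl("nested.json", "nested.jsonl", record_path="item")
for chunk in read_file_iter("nested.jsonl", memory_fraction=0.3):
    process(chunk)
```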

Notes for Excel streaming:

- Requires `openpyxl` and reads in read-only mode.
- Use `chunksize` (or `rows_per_file` for `excel_to_parquets`) to control chunking (see the sketch below).
- If `openpyxl` is unavailable, Excel files fall back to full reads and must fit in
  `memory_fraction` of available memory.
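
A minimal sketch of chunked Excel reading, assuming `read_file_iter()` accepts the `chunksize` argument mentioned in the notes above:

```python
from datablade.dataframes import read_file_iter

# Stream an Excel workbook in 50k-row chunks via openpyxl's read-only mode.
for chunk in read_file_iter("large.xlsx", chunksize=50_000, verbose=True):
    process(chunk)
```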

### Stream to Parquet partitions

If your goal is “partition to Parquet without ever materializing”, use `stream_to_parquets()`.

```python
from datablade.dataframes import excel_to_parquets, stream_to_parquets

files = stream_to_parquets(
    "huge.csv",
    output_dir="partitioned/",
    rows_per_file=200_000,
    convert_types=True,
    verbose=True,
)
print(len(files))
```

Excel to Parquet streaming:

```python
files = excel_to_parquets(
    "large.xlsx",
    output_dir="partitioned_excel/",
    rows_per_file=100_000,
    convert_types=True,
    verbose=True,
)
```

If you prefer the older helper that may choose chunk sizes automatically, you can also use `read_file_to_parquets()`.
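
If its call signature mirrors `stream_to_parquets()`, that might look like the sketch below; the parameter names are assumptions, not confirmed by this diff.

```python
from datablade.dataframes import read_file_to_parquets

# Hypothetical invocation; the helper is described as choosing chunk sizes automatically.
files = read_file_to_parquets("huge.csv", output_dir="partitioned/")
```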

If you already have an iterator (for example from `read_file_iter()`), you can send it to a sink incrementally:

```python
from datablade.dataframes import read_file_iter, stream_to_sink

chunks = read_file_iter("huge.csv", memory_fraction=0.3)
files = stream_to_sink(chunks, output_dir="partitioned/")
```

## DataFrame helpers

```python
from datablade.dataframes import clean_dataframe_columns, try_cast_string_columns_to_numeric

# clean_dataframe_columns:
# - flattens MultiIndex columns
# - coerces names to strings
# - drops duplicate columns (keeps first)

# try_cast_string_columns_to_numeric:
# - converts object columns containing numeric strings
```
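
A minimal usage sketch of the two helpers above, assuming each takes a pandas DataFrame and returns the cleaned or converted result (whether they also modify in place is not documented here):

```python
import pandas as pd

from datablade.dataframes import clean_dataframe_columns, try_cast_string_columns_to_numeric

df = pd.DataFrame({("a", "x"): ["1", "2"], ("a", "y"): ["3", "oops"]})

# Assumed to return the cleaned frame: flattened, stringified column names, duplicates dropped.
df = clean_dataframe_columns(df)

# Object columns that hold numeric strings are converted to numeric dtypes where possible.
df = try_cast_string_columns_to_numeric(df)
```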

## SQL helpers

```python
import pandas as pd
from datablade.sql import Dialect, generate_create_table, generate_create_table_from_parquet

df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})
print(generate_create_table(df, table="t", dialect=Dialect.POSTGRES))

# SQL Server batch separator (optional, only when catalog is supplied)
print(
    generate_create_table(
        df,
        catalog="db",
        table="t",
        dialect=Dialect.SQLSERVER,
        use_go=True,
    )
)

# Schema-only Parquet -> SQL (does not read/scan the data)
ddl = generate_create_table_from_parquet(
    "events.parquet",
    table="events",
    dialect=Dialect.POSTGRES,
)
print(ddl)

# Opt-in fallback for complex types (struct/list/map/union) to JSON text columns.
ddl, metadata = generate_create_table_from_parquet(
    "events.parquet",
    table="events",
    dialect=Dialect.POSTGRES,
    fallback_to_json=True,
    return_metadata=True,
)
print(metadata.fallback_columns)

# Column-level overrides (externalized schema controls)
schema_spec = {
    "defaults": {
        "string": {
            "prefer_length": "estimate",
            "defined_pad": 2,
            "min_length": 1,
            "max_length": 200,
            "empty_as_null": True,
        }
    },
    "columns": {
        "notes": {"sql_type": "nvarchar(max)"},
        "code": {"string": {"max_length": 10}},
        "legacy_id": {"sql_type": "varchar(32)", "nullable": True},
    },
}

ddl = generate_create_table(
    df,
    table="t",
    dialect=Dialect.SQLSERVER,
    schema_spec=schema_spec,
)
print(ddl)

# For Parquet DDL, schema_spec can override sql_type/nullable per column.
ddl = generate_create_table_from_parquet(
    "events.parquet",
    table="events",
    dialect=Dialect.SQLSERVER,
    schema_spec={"columns": {"legacy_id": {"sql_type": "varchar(32)", "nullable": True}}},
)
print(ddl)
```

Notes:

- Parquet DDL generation reads only the file schema via PyArrow (no DataFrame materialization).
- By default, columns with no clean mapping (for example structs/lists/maps/unions) are dropped and a
  warning is logged under logger name `datablade`.
- Set `fallback_to_json=True` to keep complex types by mapping them to text columns
  (NVARCHAR/TEXT/VARCHAR, depending on dialect) intended for JSON-encoded values.
- Use `return_metadata=True` to receive a metadata object listing dropped columns and any
  fallback-mapped columns.
- `schema_spec` lets you override column types or string sizing. When omitted, the
  existing inference behavior is unchanged.

### SQL Server helpers

Open a partitioned Parquet directory via OPENROWSET:

```python
from datablade.sql import sqlserver_openrowset_parquet

sql = sqlserver_openrowset_parquet(
    "C:/data/parts/*.parquet",
    data_source="MyExternalDataSource",
    table_alias="p",
)
print(sql)
```

Create a table from Parquet schema and generate BULK INSERT statements for
memory-aware Parquet-to-CSV staging:

```python
from datablade.sql import sqlserver_create_and_insert_from_parquet

sql, csv_files = sqlserver_create_and_insert_from_parquet(
    "events.parquet",
    output_dir="staging_csv",
    table="events",
    rows_per_file=200_000,
)
print(sql)
```

Stage multiple Parquet files and load via BCP:

```python
from datablade.sql import (
    bulk_load_sqlserver_commands,
    bulk_load_sqlserver_many,
    sqlserver_create_and_stage_from_parquets,
)

ddl, csv_files = sqlserver_create_and_stage_from_parquets(
    ["events_part1.parquet", "events_part2.parquet"],
    output_dir="staging_csv",
    table="events",
    schema_strict=True,  # raise on schema drift across Parquet files
)
print(ddl)

# Execute the DDL in SQL Server, then bulk load the staged CSVs via BCP.
bulk_load_sqlserver_many(
    csv_files,
    table_name="events",
    database="db",
    server="localhost",
    username="user",
    password="pass",
)

# Or build the BCP command strings instead of executing:
commands = bulk_load_sqlserver_commands(
    csv_files,
    table_name="events",
    database="db",
    server="localhost",
    username="user",
    password="pass",
    redact_password=True,
)
print(commands)
```

Notes:

- SQL Server bulk loading uses the `bcp` CLI. Ensure the SQL Server command line
  utilities are installed and `bcp` is on your PATH.
- When using `-U`/`-P`, the password is passed to the `bcp` process arguments
  (even though logs are redacted). Prefer trusted connection (`-T`) or Azure AD
  (`-G`) where possible.

## IO helpers

```python
from datablade.io import get_json, get_zip

data = get_json("https://api.example.com/data.json")
get_zip("https://example.com/data.zip", path="./data")
```

## Logging

datablade uses the standard Python logging system under the logger name `datablade`.

```python
import logging
from datablade.utils import configure_logging

configure_logging(level=logging.INFO)
```

Write logs to a file:

```python
import logging
from datablade.utils import configure_logging

configure_logging(level=logging.INFO, log_file="pipeline.log")
```

For rotation, pass a custom handler (e.g., `logging.handlers.RotatingFileHandler`).
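
Because the logger name is `datablade`, one way to get rotation with only the standard library is to attach the handler directly to that logger; this sketch does not depend on any datablade-specific API:

```python
import logging
import logging.handlers

# Rotate at ~10 MB, keep 5 backups.
handler = logging.handlers.RotatingFileHandler(
    "pipeline.log", maxBytes=10_000_000, backupCount=5
)
handler.setFormatter(logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s"))

logger = logging.getLogger("datablade")
logger.setLevel(logging.INFO)
logger.addHandler(handler)
```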

## Path handling

Most APIs accept `str` or `pathlib.Path` for file paths.

On case-insensitive filesystems, datablade can warn when a path's casing does not
match the on-disk entry. To treat case mismatches as errors, enable strict mode:

```python
from datablade.utils import configure_paths

configure_paths(path_strict=True)
```

## Backward compatibility

Legacy imports under `datablade.core.*` remain available.
New code should prefer `datablade.dataframes`, `datablade.sql`, `datablade.io`, and `datablade.utils`.

## Optional facade (class-style)

The primary API is module-level functions, but datablade also provides an optional convenience facade for users who prefer an object-style entrypoint with shared defaults.

```python
from datablade import Blade
from datablade.sql import Dialect

blade = Blade(memory_fraction=0.3, verbose=True, convert_types=True)

for chunk in blade.iter("huge.csv"):
    ...

files = blade.stream_to_parquets("huge.csv", output_dir="partitioned/")

# Generate DDL (CREATE TABLE) for a dialect
ddl = blade.create_table_sql(
    df,
    table="my_table",
    dialect=Dialect.POSTGRES,
)
```

datablade/docs/__init__.py
ADDED

```python
from __future__ import annotations

import argparse
from importlib import resources
from pathlib import Path


def list_docs() -> list[str]:
    files = []
    for entry in resources.files(__package__).iterdir():
        if entry.suffix == ".md":
            files.append(entry.stem)
    return sorted(files)


def read_doc(name: str) -> str:
    if not name:
        raise ValueError("Document name is required.")
    filename = name if name.lower().endswith(".md") else f"{name}.md"
    doc_path = resources.files(__package__) / filename
    if not doc_path.is_file():
        available = ", ".join(list_docs()) or "none"
        raise FileNotFoundError(f"Unknown document '{name}'. Available: {available}.")
    return doc_path.read_text(encoding="utf-8")


def write_docs(target_dir: str | Path) -> list[Path]:
    target = Path(target_dir).expanduser().resolve()
    target.mkdir(parents=True, exist_ok=True)
    written: list[Path] = []
    for entry in resources.files(__package__).iterdir():
        if entry.suffix != ".md":
            continue
        content = entry.read_text(encoding="utf-8")
        destination = target / entry.name
        destination.write_text(content, encoding="utf-8")
        written.append(destination)
    return written


def _build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        description="Read datablade documentation bundled with the package."
    )
    parser.add_argument(
        "--list",
        action="store_true",
        help="List available documentation files.",
    )
    parser.add_argument(
        "--show",
        metavar="NAME",
        help="Print a documentation file to stdout (e.g. USAGE, ARCHITECTURE).",
    )
    parser.add_argument(
        "--write-dir",
        metavar="PATH",
        help="Write all documentation files to a directory.",
    )
    return parser


def main(argv: list[str] | None = None) -> int:
    parser = _build_parser()
    args = parser.parse_args(argv)

    if args.list:
        for name in list_docs():
            print(name)
        return 0

    if args.show:
        print(read_doc(args.show))
        return 0

    if args.write_dir:
        written = write_docs(args.write_dir)
        for path in written:
            print(path)
        return 0

    parser.print_help()
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
```
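
These helpers make the bundled docs scriptable from Python; a small sketch using only the functions defined above (the `python -m datablade.docs` entry point is provided by the accompanying `__main__.py`):

```python
from datablade.docs import list_docs, read_doc, write_docs

print(list_docs())              # e.g. ["ARCHITECTURE", "OBJECT_REGISTRY", "README", "TESTING", "USAGE"]
print(read_doc("USAGE")[:200])  # first 200 characters of the bundled USAGE.md
write_docs("./datablade_docs")  # copy every bundled .md file into a local directory
```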
datablade/io/json.py
CHANGED

```diff
@@ -1,3 +1,6 @@
+"""HTTP helpers for retrieving JSON payloads."""
+
+import time
 from typing import Any, Dict
 
 import requests
@@ -5,13 +8,23 @@ import requests
 from ..utils.messages import print_verbose
 
 
-def get(url: str, verbose: bool = False, **kwargs: Any) -> Dict[str, Any]:
+def get(
+    url: str,
+    verbose: bool = False,
+    retries: int = 3,
+    backoff_factor: float = 0.5,
+    timeout: float | None = 10.0,
+    **kwargs: Any,
+) -> Dict[str, Any]:
     """
     Get JSON data from a URL using HTTP GET request.
 
     Args:
         url: The URL to fetch JSON data from (must be non-empty string).
         verbose: If True, prints error messages.
+        retries: Number of retry attempts after the initial request.
+        backoff_factor: Backoff factor for exponential retry delay.
+        timeout: Optional timeout (in seconds) passed to requests.get().
         **kwargs: Additional keyword arguments passed to requests.get().
 
     Returns:
@@ -24,10 +37,34 @@ def get(url: str, verbose: bool = False, **kwargs: Any) -> Dict[str, Any]:
     if not isinstance(url, str) or not url.strip():
         raise ValueError("url must be a non-empty string")
 
-
-
-
-
-
-
-
+    if retries < 0:
+        raise ValueError("retries must be >= 0")
+    if backoff_factor < 0:
+        raise ValueError("backoff_factor must be >= 0")
+
+    if "timeout" not in kwargs and timeout is not None:
+        kwargs["timeout"] = timeout
+
+    attempts = retries + 1
+    for attempt in range(1, attempts + 1):
+        try:
+            # Delegates HTTP details to requests; callers can pass headers/etc.
+            response = requests.get(url, **kwargs)
+            response.raise_for_status()  # Raise exception for bad status codes
+            return response.json()
+        except requests.exceptions.RequestException as e:
+            message = (
+                f"Error fetching JSON from {url} "
+                f"(attempt {attempt}/{attempts}): {e}"
+            )
+            print_verbose(message, verbose=verbose)
+            if attempt >= attempts:
+                raise requests.RequestException(
+                    f"Failed to fetch JSON from {url} after {attempts} attempts."
+                ) from e
+            sleep_time = backoff_factor * (2 ** (attempt - 1))
+            if sleep_time:
+                time.sleep(sleep_time)
+    raise requests.RequestException(
+        f"Failed to fetch JSON from {url} after {attempts} attempts."
+    )
```
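
The new retry loop sleeps `backoff_factor * 2 ** (attempt - 1)` seconds between attempts, so the defaults (`retries=3`, `backoff_factor=0.5`) give waits of 0.5 s, 1 s, and 2 s before the final failure is raised. A direct call to the module-level function shown above:

```python
import datablade.io.json as io_json

# Four total attempts (1 initial + retries=3), waiting 0.25 s, 0.5 s, 1 s between them.
payload = io_json.get(
    "https://api.example.com/data.json",
    retries=3,
    backoff_factor=0.25,
    timeout=5.0,
    verbose=True,
)
```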
datablade/io/zip.py
CHANGED

```diff
@@ -1,17 +1,24 @@
+"""HTTP helper for downloading and extracting ZIP archives."""
+
 import io
 import pathlib
+import time
 import zipfile
 from typing import Any, Optional
 
 import requests
 
 from ..utils.messages import print_verbose
+from ..utils.strings import ensure_directory
 
 
 def get(
     url: str,
     path: Optional[str | pathlib.Path] = None,
     verbose: bool = False,
+    retries: int = 3,
+    backoff_factor: float = 0.5,
+    timeout: float | None = 30.0,
     **kwargs: Any,
 ) -> Optional[io.BytesIO]:
     """
@@ -23,6 +30,9 @@ def get(
             If None, returns the ZIP data as a BytesIO object.
             If provided, extracts all files to the specified path.
         verbose: If True, prints progress messages.
+        retries: Number of retry attempts after the initial request.
+        backoff_factor: Backoff factor for exponential retry delay.
+        timeout: Optional timeout (in seconds) passed to requests.get().
         **kwargs: Additional keyword arguments passed to requests.get().
 
     Returns:
@@ -39,35 +49,63 @@ def get(
     if not isinstance(url, str) or not url.strip():
         raise ValueError("url must be a non-empty string")
 
-
-
-
-
-
-
+    if retries < 0:
+        raise ValueError("retries must be >= 0")
+    if backoff_factor < 0:
+        raise ValueError("backoff_factor must be >= 0")
+
+    if "timeout" not in kwargs and timeout is not None:
+        kwargs["timeout"] = timeout
+
+    attempts = retries + 1
+    for attempt in range(1, attempts + 1):
+        try:
+            print_verbose(f"Downloading {url}", verbose=verbose)
+            response = requests.get(url, **kwargs)
+            response.raise_for_status()  # Raise exception for bad status codes
+            # Keep data in-memory to allow optional extraction or return BytesIO.
+            data = response.content
+            zip_buffer = io.BytesIO(data)
+
+            if path is None:
+                return zip_buffer
 
-
-
+            print_verbose(f"Saving data to {path}", verbose=verbose)
+            zip_buffer.seek(0)
+            with zipfile.ZipFile(zip_buffer, "r") as zip_ref:
+                # Unlike utils.strings.pathing(), extracting should work even if the
+                # destination directory doesn't exist yet.
+                extract_root = ensure_directory(path, verbose=verbose, label="path")
+                for zip_info in zip_ref.infolist():
+                    extract_path = extract_root / zip_info.filename
+                    ensure_directory(
+                        extract_path.parent,
+                        verbose=verbose,
+                        label="path",
+                    )
+                    with open(extract_path, "wb") as f:
+                        f.write(zip_ref.read(zip_info.filename))
+            return None
+        except requests.exceptions.RequestException as e:
+            message = (
+                f"Error downloading ZIP from {url} "
+                f"(attempt {attempt}/{attempts}): {e}"
+            )
+            print_verbose(message, verbose=verbose)
+            if attempt >= attempts:
+                raise requests.RequestException(
+                    f"Failed to download ZIP from {url} after {attempts} attempts."
+                ) from e
+            sleep_time = backoff_factor * (2 ** (attempt - 1))
+            if sleep_time:
+                time.sleep(sleep_time)
+        except zipfile.BadZipFile as e:
+            print_verbose(f"Error: Invalid ZIP file from {url}: {e}", verbose=verbose)
+            raise
+        except Exception as e:
+            print_verbose(f"Error processing ZIP file: {e}", verbose=verbose)
+            raise
 
-
-
-            # Unlike utils.strings.pathing(), extracting should work even if the
-            # destination directory doesn't exist yet.
-            extract_root = pathlib.Path(path)
-            extract_root.mkdir(parents=True, exist_ok=True)
-            for zip_info in zip_ref.infolist():
-                extract_path = extract_root / zip_info.filename
-                extract_path.parent.mkdir(parents=True, exist_ok=True)
-                with open(extract_path, "wb") as f:
-                    f.write(zip_ref.read(zip_info.filename))
-            return None
-    except requests.exceptions.RequestException as e:
-        print_verbose(f"Error downloading ZIP from {url}: {e}", verbose=verbose)
-        raise
-    except zipfile.BadZipFile as e:
-        print_verbose(f"Error: Invalid ZIP file from {url}: {e}", verbose=verbose)
-        raise
-    except Exception as e:
-        print_verbose(f"Error processing ZIP file: {e}", verbose=verbose)
-        raise
+    raise requests.RequestException(
+        f"Failed to download ZIP from {url} after {attempts} attempts."
+    )
```