simtoolsz 0.2.15__tar.gz → 0.2.16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/.gitignore +1 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/PKG-INFO +1 -1
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/pyproject.toml +1 -1
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/requirements-dev.lock +2 -2
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/requirements.lock +2 -2
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/src/simtoolsz/__init__.py +1 -1
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/src/simtoolsz/reader.py +31 -23
- simtoolsz-0.2.16/tests/test_load_data_optimization.py +185 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/.github/workflows/publish.yml +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/.github/workflows/test.yml +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/.python-version +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/LICENSE +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/README.md +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/README_EN.md +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/README_countrycode.md +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/README_countrycode_en.md +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/docs/DATETIME_CONVERSION.md +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/docs/iso3166-1.xlsx +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/docs/mail_usage_guide.md +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/docs/special2db_usage.md +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/examples/conversion_examples.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/examples/mail_examples.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/examples/special2db_example.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/examples/today_examples.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/examples/zip2db_example.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/src/simtoolsz/columns_info +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/src/simtoolsz/country.parquet +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/src/simtoolsz/countrycode.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/src/simtoolsz/datetime.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/src/simtoolsz/db.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/src/simtoolsz/mail.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/src/simtoolsz/math.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/src/simtoolsz/utils.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_conversion.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_countrycode.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_countrycode_optimization.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_iso_comprehensive.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_iso_format.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_optimized_reader.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_simple.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_smoke.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_special2db.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_special2db_simple.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_today_optimized.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_which_format.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_zip2db.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_zip2db_simple.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/verify_unicode_fix.py +0 -0
- {simtoolsz-0.2.15 → simtoolsz-0.2.16}/uv.lock +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: simtoolsz
|
|
3
|
-
Version: 0.2.15
|
|
3
|
+
Version: 0.2.16
|
|
4
4
|
Summary: A simple and convenient toolkit containing useful functions, classes, and methods.
|
|
5
5
|
Project-URL: Homepage, https://github.com/SidneyLYZhang/simtoolsz
|
|
6
6
|
Project-URL: Repository, https://github.com/SidneyLYZhang/simtoolsz.git
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
-e file:.
|
|
13
13
|
duckdb==1.4.0
|
|
14
14
|
# via simtoolsz
|
|
15
|
-
numpy==2.
|
|
15
|
+
numpy==2.5.0
|
|
16
16
|
# via simtoolsz
|
|
17
17
|
pendulum==3.1.0
|
|
18
18
|
# via simtoolsz
|
|
@@ -20,7 +20,7 @@ polars==1.35.1
|
|
|
20
20
|
# via simtoolsz
|
|
21
21
|
polars-runtime-32==1.35.1
|
|
22
22
|
# via polars
|
|
23
|
-
pyarrow==
|
|
23
|
+
pyarrow==24.0.0
|
|
24
24
|
# via simtoolsz
|
|
25
25
|
python-dateutil==2.9.0.post0
|
|
26
26
|
# via pendulum
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
-e file:.
|
|
13
13
|
duckdb==1.4.0
|
|
14
14
|
# via simtoolsz
|
|
15
|
-
numpy==2.
|
|
15
|
+
numpy==2.5.0
|
|
16
16
|
# via simtoolsz
|
|
17
17
|
pendulum==3.1.0
|
|
18
18
|
# via simtoolsz
|
|
@@ -20,7 +20,7 @@ polars==1.35.1
|
|
|
20
20
|
# via simtoolsz
|
|
21
21
|
polars-runtime-32==1.35.1
|
|
22
22
|
# via polars
|
|
23
|
-
pyarrow==
|
|
23
|
+
pyarrow==24.0.0
|
|
24
24
|
# via simtoolsz
|
|
25
25
|
python-dateutil==2.9.0.post0
|
|
26
26
|
# via pendulum
|
|
@@ -108,6 +108,29 @@ def scan_tsv(filepath: Path, **kwargs) -> pl.LazyFrame:
|
|
|
108
108
|
return read_tsv(filepath, lazy=True, **kwargs)
|
|
109
109
|
|
|
110
110
|
|
|
111
|
+
_NON_ARCHIVE_EXTS = {".csv", ".tsv", ".json", ".parquet", ".ipc", ".avro", ".xlsx", ".xls", ".ods"}
|
|
112
|
+
_ARCHIVE_EXTS = {".zip", ".tar", ".gz", ".bz2"}
|
|
113
|
+
|
|
114
|
+
_EAGER_READERS: dict[str, Callable] = {
|
|
115
|
+
"csv": pl.read_csv,
|
|
116
|
+
"tsv": read_tsv,
|
|
117
|
+
"xls": pl.read_excel,
|
|
118
|
+
"xlsx": pl.read_excel,
|
|
119
|
+
"ods": pl.read_ods,
|
|
120
|
+
"json": pl.read_json,
|
|
121
|
+
"parquet": pl.read_parquet,
|
|
122
|
+
"ipc": pl.read_ipc,
|
|
123
|
+
"avro": pl.read_avro,
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
_LAZY_READERS: dict[str, Callable] = {
|
|
127
|
+
"csv": pl.scan_csv,
|
|
128
|
+
"parquet": pl.scan_parquet,
|
|
129
|
+
"ipc": pl.scan_ipc,
|
|
130
|
+
"tsv": scan_tsv,
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
|
|
111
134
|
def _validate_input(
|
|
112
135
|
file_path: Path | str, format_type: Optional[str]
|
|
113
136
|
) -> tuple[Path, str]:
|
|
@@ -157,25 +180,7 @@ def _get_reader_mapping(lazy: bool) -> dict[str, Callable]:
|
|
|
157
180
|
Returns:
|
|
158
181
|
dict: 格式到读取函数的映射
|
|
159
182
|
"""
|
|
160
|
-
if lazy:
|
|
161
|
-
return {
|
|
162
|
-
"csv": pl.scan_csv,
|
|
163
|
-
"parquet": pl.scan_parquet,
|
|
164
|
-
"ipc": pl.scan_ipc,
|
|
165
|
-
"tsv": scan_tsv,
|
|
166
|
-
}
|
|
167
|
-
|
|
168
|
-
return {
|
|
169
|
-
"csv": pl.read_csv,
|
|
170
|
-
"tsv": read_tsv,
|
|
171
|
-
"xls": pl.read_excel,
|
|
172
|
-
"xlsx": pl.read_excel,
|
|
173
|
-
"ods": pl.read_ods,
|
|
174
|
-
"json": pl.read_json,
|
|
175
|
-
"parquet": pl.read_parquet,
|
|
176
|
-
"ipc": pl.read_ipc,
|
|
177
|
-
"avro": pl.read_avro,
|
|
178
|
-
}
|
|
183
|
+
return _LAZY_READERS if lazy else _EAGER_READERS
|
|
179
184
|
|
|
180
185
|
|
|
181
186
|
def _handle_focus_mode(fmt: str) -> Callable:
|
|
@@ -291,6 +296,9 @@ def is_archive_file(file_path: Path) -> bool:
|
|
|
291
296
|
Returns:
|
|
292
297
|
bool: 如果是压缩文件或在压缩文件中返回True
|
|
293
298
|
"""
|
|
299
|
+
if file_path.suffix.lower() in _NON_ARCHIVE_EXTS:
|
|
300
|
+
if not any(p.suffix.lower() in _ARCHIVE_EXTS for p in file_path.parents):
|
|
301
|
+
return False
|
|
294
302
|
return _is_archive_file(file_path) or any(
|
|
295
303
|
_is_archive_file(p) for p in file_path.parents
|
|
296
304
|
)
|
|
@@ -544,6 +552,8 @@ def load_data(
|
|
|
544
552
|
>>> lazy_df = load_data("data.parquet", lazy=True)
|
|
545
553
|
>>> df = load_data("data.zip/users.csv")
|
|
546
554
|
"""
|
|
555
|
+
file_path = Path(file_path)
|
|
556
|
+
|
|
547
557
|
if is_archive_file(file_path):
|
|
548
558
|
df = read_archive(file_path, format_type=format_type, **kwargs)
|
|
549
559
|
else:
|
|
@@ -557,10 +567,8 @@ def load_data(
|
|
|
557
567
|
df = reader(file_path, **kwargs)
|
|
558
568
|
|
|
559
569
|
if transtype is not None:
|
|
560
|
-
if isinstance(transtype, pl.Expr):
|
|
561
|
-
|
|
562
|
-
if isinstance(transtype, list):
|
|
563
|
-
df = df.with_columns(*transtype)
|
|
570
|
+
exprs = [transtype] if isinstance(transtype, pl.Expr) else transtype
|
|
571
|
+
df = df.with_columns(exprs)
|
|
564
572
|
return df
|
|
565
573
|
|
|
566
574
|
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Tests for load_data dispatch path optimizations.
|
|
3
|
+
|
|
4
|
+
Covers:
|
|
5
|
+
1. is_archive_file short-circuit for known non-archive extensions
|
|
6
|
+
2. str input compatibility (no AttributeError)
|
|
7
|
+
3. reader mapping cached as module-level constants
|
|
8
|
+
4. transtype single with_columns application (Expr / list / None)
|
|
9
|
+
5. regression: archive path data.zip/users.csv still reads correctly
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import sys
|
|
13
|
+
import tempfile
|
|
14
|
+
import zipfile
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from unittest.mock import patch
|
|
17
|
+
|
|
18
|
+
import polars as pl
|
|
19
|
+
|
|
20
|
+
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
21
|
+
|
|
22
|
+
from simtoolsz.reader import (
|
|
23
|
+
load_data,
|
|
24
|
+
is_archive_file,
|
|
25
|
+
_get_reader_mapping,
|
|
26
|
+
_EAGER_READERS,
|
|
27
|
+
_LAZY_READERS,
|
|
28
|
+
_NON_ARCHIVE_EXTS,
|
|
29
|
+
_ARCHIVE_EXTS,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _make_csv(path: Path, content: str = "x,y\n1,2\n3,4\n"):
|
|
34
|
+
path.write_text(content, encoding="utf-8")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def test_short_circuit_skips_magic_bytes_check():
|
|
38
|
+
"""Known non-archive extension should not trigger is_zipfile/is_tarfile."""
|
|
39
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
40
|
+
csv_path = Path(tmp) / "data.csv"
|
|
41
|
+
_make_csv(csv_path)
|
|
42
|
+
|
|
43
|
+
with patch("simtoolsz.reader.is_zipfile") as mock_zip, \
|
|
44
|
+
patch("simtoolsz.reader.is_tarfile") as mock_tar:
|
|
45
|
+
mock_zip.return_value = False
|
|
46
|
+
mock_tar.return_value = False
|
|
47
|
+
result = is_archive_file(csv_path)
|
|
48
|
+
assert result is False
|
|
49
|
+
assert mock_zip.call_count == 0, "is_zipfile should not be called for .csv"
|
|
50
|
+
assert mock_tar.call_count == 0, "is_tarfile should not be called for .csv"
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def test_short_circuit_for_all_non_archive_exts():
|
|
54
|
+
"""Every extension in _NON_ARCHIVE_EXTS should short-circuit."""
|
|
55
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
56
|
+
for ext in _NON_ARCHIVE_EXTS:
|
|
57
|
+
p = Path(tmp) / f"file{ext}"
|
|
58
|
+
_make_csv(p)
|
|
59
|
+
with patch("simtoolsz.reader.is_zipfile") as mock_zip, \
|
|
60
|
+
patch("simtoolsz.reader.is_tarfile") as mock_tar:
|
|
61
|
+
mock_zip.return_value = False
|
|
62
|
+
mock_tar.return_value = False
|
|
63
|
+
assert is_archive_file(p) is False, f"short-circuit failed for {ext}"
|
|
64
|
+
assert mock_zip.call_count == 0, f"is_zipfile called for {ext}"
|
|
65
|
+
assert mock_tar.call_count == 0, f"is_tarfile called for {ext}"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def test_str_input_no_attribute_error():
|
|
69
|
+
"""load_data with str input should not raise AttributeError."""
|
|
70
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
71
|
+
csv_path = Path(tmp) / "data.csv"
|
|
72
|
+
_make_csv(csv_path)
|
|
73
|
+
|
|
74
|
+
df = load_data(str(csv_path))
|
|
75
|
+
assert df.shape == (2, 2)
|
|
76
|
+
assert df.columns == ["x", "y"]
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def test_reader_mapping_cached():
|
|
80
|
+
"""_get_reader_mapping should return the same constant object."""
|
|
81
|
+
assert _get_reader_mapping(lazy=True) is _LAZY_READERS
|
|
82
|
+
assert _get_reader_mapping(lazy=False) is _EAGER_READERS
|
|
83
|
+
assert _get_reader_mapping(lazy=True) is _get_reader_mapping(lazy=True)
|
|
84
|
+
assert _get_reader_mapping(lazy=False) is _get_reader_mapping(lazy=False)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def test_transtype_single_expr():
|
|
88
|
+
"""transtype as single Expr applied via single with_columns."""
|
|
89
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
90
|
+
csv_path = Path(tmp) / "data.csv"
|
|
91
|
+
_make_csv(csv_path)
|
|
92
|
+
|
|
93
|
+
df = load_data(csv_path, transtype=pl.col("x").cast(pl.Int32))
|
|
94
|
+
assert df.schema["x"] == pl.Int32
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def test_transtype_list():
|
|
98
|
+
"""transtype as list applied via single with_columns."""
|
|
99
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
100
|
+
csv_path = Path(tmp) / "data.csv"
|
|
101
|
+
_make_csv(csv_path)
|
|
102
|
+
|
|
103
|
+
df = load_data(csv_path, transtype=[
|
|
104
|
+
pl.col("x").cast(pl.Int32),
|
|
105
|
+
pl.col("y").cast(pl.Float64),
|
|
106
|
+
])
|
|
107
|
+
assert df.schema["x"] == pl.Int32
|
|
108
|
+
assert df.schema["y"] == pl.Float64
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def test_transtype_none():
|
|
112
|
+
"""transtype=None should not call with_columns, behavior unchanged."""
|
|
113
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
114
|
+
csv_path = Path(tmp) / "data.csv"
|
|
115
|
+
_make_csv(csv_path)
|
|
116
|
+
|
|
117
|
+
df = load_data(csv_path)
|
|
118
|
+
assert df.shape == (2, 2)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def test_archive_path_still_reads():
|
|
122
|
+
"""Regression: data.zip/users.csv should still be read correctly."""
|
|
123
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
124
|
+
zip_path = Path(tmp) / "data.zip"
|
|
125
|
+
with zipfile.ZipFile(zip_path, "w") as zf:
|
|
126
|
+
zf.writestr("users.csv", "x,y\n1,2\n3,4\n")
|
|
127
|
+
|
|
128
|
+
inner_path = zip_path / "users.csv"
|
|
129
|
+
df = load_data(inner_path)
|
|
130
|
+
assert df.shape == (2, 2)
|
|
131
|
+
assert df.columns == ["x", "y"]
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def test_unknown_extension_falls_back():
|
|
135
|
+
"""Unknown extension should fall back to original magic-bytes check."""
|
|
136
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
137
|
+
no_ext_path = Path(tmp) / "data"
|
|
138
|
+
_make_csv(no_ext_path)
|
|
139
|
+
|
|
140
|
+
result = is_archive_file(no_ext_path)
|
|
141
|
+
assert result is False
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def test_constants_defined():
|
|
145
|
+
"""Verify required constants exist and have expected content."""
|
|
146
|
+
assert ".csv" in _NON_ARCHIVE_EXTS
|
|
147
|
+
assert ".parquet" in _NON_ARCHIVE_EXTS
|
|
148
|
+
assert ".zip" in _ARCHIVE_EXTS
|
|
149
|
+
assert ".tar" in _ARCHIVE_EXTS
|
|
150
|
+
assert "csv" in _EAGER_READERS
|
|
151
|
+
assert "csv" in _LAZY_READERS
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def main():
|
|
155
|
+
tests = [
|
|
156
|
+
test_short_circuit_skips_magic_bytes_check,
|
|
157
|
+
test_short_circuit_for_all_non_archive_exts,
|
|
158
|
+
test_str_input_no_attribute_error,
|
|
159
|
+
test_reader_mapping_cached,
|
|
160
|
+
test_transtype_single_expr,
|
|
161
|
+
test_transtype_list,
|
|
162
|
+
test_transtype_none,
|
|
163
|
+
test_archive_path_still_reads,
|
|
164
|
+
test_unknown_extension_falls_back,
|
|
165
|
+
test_constants_defined,
|
|
166
|
+
]
|
|
167
|
+
failed = 0
|
|
168
|
+
for t in tests:
|
|
169
|
+
try:
|
|
170
|
+
t()
|
|
171
|
+
print(f"PASS {t.__name__}")
|
|
172
|
+
except Exception as e:
|
|
173
|
+
print(f"FAIL {t.__name__}: {e}")
|
|
174
|
+
import traceback
|
|
175
|
+
traceback.print_exc()
|
|
176
|
+
failed += 1
|
|
177
|
+
if failed:
|
|
178
|
+
print(f"\n{failed}/{len(tests)} tests failed")
|
|
179
|
+
return 1
|
|
180
|
+
print(f"\n{len(tests)}/{len(tests)} tests passed")
|
|
181
|
+
return 0
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
if __name__ == "__main__":
|
|
185
|
+
sys.exit(main())
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|