simtoolsz 0.2.15__tar.gz → 0.2.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/.gitignore +1 -0
  2. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/PKG-INFO +1 -1
  3. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/pyproject.toml +1 -1
  4. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/requirements-dev.lock +2 -2
  5. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/requirements.lock +2 -2
  6. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/src/simtoolsz/__init__.py +1 -1
  7. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/src/simtoolsz/reader.py +31 -23
  8. simtoolsz-0.2.16/tests/test_load_data_optimization.py +185 -0
  9. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/.github/workflows/publish.yml +0 -0
  10. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/.github/workflows/test.yml +0 -0
  11. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/.python-version +0 -0
  12. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/LICENSE +0 -0
  13. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/README.md +0 -0
  14. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/README_EN.md +0 -0
  15. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/README_countrycode.md +0 -0
  16. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/README_countrycode_en.md +0 -0
  17. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/docs/DATETIME_CONVERSION.md +0 -0
  18. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/docs/iso3166-1.xlsx +0 -0
  19. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/docs/mail_usage_guide.md +0 -0
  20. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/docs/special2db_usage.md +0 -0
  21. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/examples/conversion_examples.py +0 -0
  22. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/examples/mail_examples.py +0 -0
  23. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/examples/special2db_example.py +0 -0
  24. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/examples/today_examples.py +0 -0
  25. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/examples/zip2db_example.py +0 -0
  26. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/src/simtoolsz/columns_info +0 -0
  27. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/src/simtoolsz/country.parquet +0 -0
  28. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/src/simtoolsz/countrycode.py +0 -0
  29. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/src/simtoolsz/datetime.py +0 -0
  30. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/src/simtoolsz/db.py +0 -0
  31. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/src/simtoolsz/mail.py +0 -0
  32. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/src/simtoolsz/math.py +0 -0
  33. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/src/simtoolsz/utils.py +0 -0
  34. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_conversion.py +0 -0
  35. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_countrycode.py +0 -0
  36. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_countrycode_optimization.py +0 -0
  37. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_iso_comprehensive.py +0 -0
  38. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_iso_format.py +0 -0
  39. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_optimized_reader.py +0 -0
  40. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_simple.py +0 -0
  41. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_smoke.py +0 -0
  42. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_special2db.py +0 -0
  43. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_special2db_simple.py +0 -0
  44. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_today_optimized.py +0 -0
  45. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_which_format.py +0 -0
  46. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_zip2db.py +0 -0
  47. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/test_zip2db_simple.py +0 -0
  48. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/tests/verify_unicode_fix.py +0 -0
  49. {simtoolsz-0.2.15 → simtoolsz-0.2.16}/uv.lock +0 -0
@@ -9,3 +9,4 @@ wheels/
9
9
 
10
10
  # venv
11
11
  .venv
12
+ .trae/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: simtoolsz
3
- Version: 0.2.15
3
+ Version: 0.2.16
4
4
  Summary: A simple and convenient toolkit containing useful functions, classes, and methods.
5
5
  Project-URL: Homepage, https://github.com/SidneyLYZhang/simtoolsz
6
6
  Project-URL: Repository, https://github.com/SidneyLYZhang/simtoolsz.git
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "simtoolsz"
3
- version = "0.2.15"
3
+ version = "0.2.16"
4
4
  description = "A simple and convenient toolkit containing useful functions, classes, and methods."
5
5
  keywords = ["tool", "collection"]
6
6
  license = { text = "MulanPSL-2.0" }
@@ -12,7 +12,7 @@
12
12
  -e file:.
13
13
  duckdb==1.4.0
14
14
  # via simtoolsz
15
- numpy==2.4.4
15
+ numpy==2.5.0
16
16
  # via simtoolsz
17
17
  pendulum==3.1.0
18
18
  # via simtoolsz
@@ -20,7 +20,7 @@ polars==1.35.1
20
20
  # via simtoolsz
21
21
  polars-runtime-32==1.35.1
22
22
  # via polars
23
- pyarrow==23.0.1
23
+ pyarrow==24.0.0
24
24
  # via simtoolsz
25
25
  python-dateutil==2.9.0.post0
26
26
  # via pendulum
@@ -12,7 +12,7 @@
12
12
  -e file:.
13
13
  duckdb==1.4.0
14
14
  # via simtoolsz
15
- numpy==2.4.4
15
+ numpy==2.5.0
16
16
  # via simtoolsz
17
17
  pendulum==3.1.0
18
18
  # via simtoolsz
@@ -20,7 +20,7 @@ polars==1.35.1
20
20
  # via simtoolsz
21
21
  polars-runtime-32==1.35.1
22
22
  # via polars
23
- pyarrow==23.0.1
23
+ pyarrow==24.0.0
24
24
  # via simtoolsz
25
25
  python-dateutil==2.9.0.post0
26
26
  # via pendulum
@@ -26,7 +26,7 @@ import simtoolsz.math as math
26
26
  try:
27
27
  __version__ = importlib.metadata.version("simtoolsz")
28
28
  except importlib.metadata.PackageNotFoundError:
29
- __version__ = "0.2.15"
29
+ __version__ = "0.2.16"
30
30
 
31
31
  __author__ = "Sidney Zhang <zly@lyzhang.me>"
32
32
 
@@ -108,6 +108,29 @@ def scan_tsv(filepath: Path, **kwargs) -> pl.LazyFrame:
108
108
  return read_tsv(filepath, lazy=True, **kwargs)
109
109
 
110
110
 
111
+ _NON_ARCHIVE_EXTS = {".csv", ".tsv", ".json", ".parquet", ".ipc", ".avro", ".xlsx", ".xls", ".ods"}
112
+ _ARCHIVE_EXTS = {".zip", ".tar", ".gz", ".bz2"}
113
+
114
+ _EAGER_READERS: dict[str, Callable] = {
115
+ "csv": pl.read_csv,
116
+ "tsv": read_tsv,
117
+ "xls": pl.read_excel,
118
+ "xlsx": pl.read_excel,
119
+ "ods": pl.read_ods,
120
+ "json": pl.read_json,
121
+ "parquet": pl.read_parquet,
122
+ "ipc": pl.read_ipc,
123
+ "avro": pl.read_avro,
124
+ }
125
+
126
+ _LAZY_READERS: dict[str, Callable] = {
127
+ "csv": pl.scan_csv,
128
+ "parquet": pl.scan_parquet,
129
+ "ipc": pl.scan_ipc,
130
+ "tsv": scan_tsv,
131
+ }
132
+
133
+
111
134
  def _validate_input(
112
135
  file_path: Path | str, format_type: Optional[str]
113
136
  ) -> tuple[Path, str]:
@@ -157,25 +180,7 @@ def _get_reader_mapping(lazy: bool) -> dict[str, Callable]:
157
180
  Returns:
158
181
  dict: 格式到读取函数的映射
159
182
  """
160
- if lazy:
161
- return {
162
- "csv": pl.scan_csv,
163
- "parquet": pl.scan_parquet,
164
- "ipc": pl.scan_ipc,
165
- "tsv": scan_tsv,
166
- }
167
-
168
- return {
169
- "csv": pl.read_csv,
170
- "tsv": read_tsv,
171
- "xls": pl.read_excel,
172
- "xlsx": pl.read_excel,
173
- "ods": pl.read_ods,
174
- "json": pl.read_json,
175
- "parquet": pl.read_parquet,
176
- "ipc": pl.read_ipc,
177
- "avro": pl.read_avro,
178
- }
183
+ return _LAZY_READERS if lazy else _EAGER_READERS
179
184
 
180
185
 
181
186
  def _handle_focus_mode(fmt: str) -> Callable:
@@ -291,6 +296,9 @@ def is_archive_file(file_path: Path) -> bool:
291
296
  Returns:
292
297
  bool: 如果是压缩文件或在压缩文件中返回True
293
298
  """
299
+ if file_path.suffix.lower() in _NON_ARCHIVE_EXTS:
300
+ if not any(p.suffix.lower() in _ARCHIVE_EXTS for p in file_path.parents):
301
+ return False
294
302
  return _is_archive_file(file_path) or any(
295
303
  _is_archive_file(p) for p in file_path.parents
296
304
  )
@@ -544,6 +552,8 @@ def load_data(
544
552
  >>> lazy_df = load_data("data.parquet", lazy=True)
545
553
  >>> df = load_data("data.zip/users.csv")
546
554
  """
555
+ file_path = Path(file_path)
556
+
547
557
  if is_archive_file(file_path):
548
558
  df = read_archive(file_path, format_type=format_type, **kwargs)
549
559
  else:
@@ -557,10 +567,8 @@ def load_data(
557
567
  df = reader(file_path, **kwargs)
558
568
 
559
569
  if transtype is not None:
560
- if isinstance(transtype, pl.Expr):
561
- df = df.with_columns(transtype)
562
- if isinstance(transtype, list):
563
- df = df.with_columns(*transtype)
570
+ exprs = [transtype] if isinstance(transtype, pl.Expr) else transtype
571
+ df = df.with_columns(exprs)
564
572
  return df
565
573
 
566
574
 
@@ -0,0 +1,185 @@
1
+ #!/usr/bin/env python3
2
+ """Tests for load_data dispatch path optimizations.
3
+
4
+ Covers:
5
+ 1. is_archive_file short-circuit for known non-archive extensions
6
+ 2. str input compatibility (no AttributeError)
7
+ 3. reader mapping cached as module-level constants
8
+ 4. transtype single with_columns application (Expr / list / None)
9
+ 5. regression: archive path data.zip/users.csv still reads correctly
10
+ """
11
+
12
+ import sys
13
+ import tempfile
14
+ import zipfile
15
+ from pathlib import Path
16
+ from unittest.mock import patch
17
+
18
+ import polars as pl
19
+
20
+ sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
21
+
22
+ from simtoolsz.reader import (
23
+ load_data,
24
+ is_archive_file,
25
+ _get_reader_mapping,
26
+ _EAGER_READERS,
27
+ _LAZY_READERS,
28
+ _NON_ARCHIVE_EXTS,
29
+ _ARCHIVE_EXTS,
30
+ )
31
+
32
+
33
+ def _make_csv(path: Path, content: str = "x,y\n1,2\n3,4\n"):
34
+ path.write_text(content, encoding="utf-8")
35
+
36
+
37
+ def test_short_circuit_skips_magic_bytes_check():
38
+ """Known non-archive extension should not trigger is_zipfile/is_tarfile."""
39
+ with tempfile.TemporaryDirectory() as tmp:
40
+ csv_path = Path(tmp) / "data.csv"
41
+ _make_csv(csv_path)
42
+
43
+ with patch("simtoolsz.reader.is_zipfile") as mock_zip, \
44
+ patch("simtoolsz.reader.is_tarfile") as mock_tar:
45
+ mock_zip.return_value = False
46
+ mock_tar.return_value = False
47
+ result = is_archive_file(csv_path)
48
+ assert result is False
49
+ assert mock_zip.call_count == 0, "is_zipfile should not be called for .csv"
50
+ assert mock_tar.call_count == 0, "is_tarfile should not be called for .csv"
51
+
52
+
53
+ def test_short_circuit_for_all_non_archive_exts():
54
+ """Every extension in _NON_ARCHIVE_EXTS should short-circuit."""
55
+ with tempfile.TemporaryDirectory() as tmp:
56
+ for ext in _NON_ARCHIVE_EXTS:
57
+ p = Path(tmp) / f"file{ext}"
58
+ _make_csv(p)
59
+ with patch("simtoolsz.reader.is_zipfile") as mock_zip, \
60
+ patch("simtoolsz.reader.is_tarfile") as mock_tar:
61
+ mock_zip.return_value = False
62
+ mock_tar.return_value = False
63
+ assert is_archive_file(p) is False, f"short-circuit failed for {ext}"
64
+ assert mock_zip.call_count == 0, f"is_zipfile called for {ext}"
65
+ assert mock_tar.call_count == 0, f"is_tarfile called for {ext}"
66
+
67
+
68
+ def test_str_input_no_attribute_error():
69
+ """load_data with str input should not raise AttributeError."""
70
+ with tempfile.TemporaryDirectory() as tmp:
71
+ csv_path = Path(tmp) / "data.csv"
72
+ _make_csv(csv_path)
73
+
74
+ df = load_data(str(csv_path))
75
+ assert df.shape == (2, 2)
76
+ assert df.columns == ["x", "y"]
77
+
78
+
79
+ def test_reader_mapping_cached():
80
+ """_get_reader_mapping should return the same constant object."""
81
+ assert _get_reader_mapping(lazy=True) is _LAZY_READERS
82
+ assert _get_reader_mapping(lazy=False) is _EAGER_READERS
83
+ assert _get_reader_mapping(lazy=True) is _get_reader_mapping(lazy=True)
84
+ assert _get_reader_mapping(lazy=False) is _get_reader_mapping(lazy=False)
85
+
86
+
87
+ def test_transtype_single_expr():
88
+ """transtype as single Expr applied via single with_columns."""
89
+ with tempfile.TemporaryDirectory() as tmp:
90
+ csv_path = Path(tmp) / "data.csv"
91
+ _make_csv(csv_path)
92
+
93
+ df = load_data(csv_path, transtype=pl.col("x").cast(pl.Int32))
94
+ assert df.schema["x"] == pl.Int32
95
+
96
+
97
+ def test_transtype_list():
98
+ """transtype as list applied via single with_columns."""
99
+ with tempfile.TemporaryDirectory() as tmp:
100
+ csv_path = Path(tmp) / "data.csv"
101
+ _make_csv(csv_path)
102
+
103
+ df = load_data(csv_path, transtype=[
104
+ pl.col("x").cast(pl.Int32),
105
+ pl.col("y").cast(pl.Float64),
106
+ ])
107
+ assert df.schema["x"] == pl.Int32
108
+ assert df.schema["y"] == pl.Float64
109
+
110
+
111
+ def test_transtype_none():
112
+ """transtype=None should not call with_columns, behavior unchanged."""
113
+ with tempfile.TemporaryDirectory() as tmp:
114
+ csv_path = Path(tmp) / "data.csv"
115
+ _make_csv(csv_path)
116
+
117
+ df = load_data(csv_path)
118
+ assert df.shape == (2, 2)
119
+
120
+
121
+ def test_archive_path_still_reads():
122
+ """Regression: data.zip/users.csv should still be read correctly."""
123
+ with tempfile.TemporaryDirectory() as tmp:
124
+ zip_path = Path(tmp) / "data.zip"
125
+ with zipfile.ZipFile(zip_path, "w") as zf:
126
+ zf.writestr("users.csv", "x,y\n1,2\n3,4\n")
127
+
128
+ inner_path = zip_path / "users.csv"
129
+ df = load_data(inner_path)
130
+ assert df.shape == (2, 2)
131
+ assert df.columns == ["x", "y"]
132
+
133
+
134
+ def test_unknown_extension_falls_back():
135
+ """Unknown extension should fall back to original magic-bytes check."""
136
+ with tempfile.TemporaryDirectory() as tmp:
137
+ no_ext_path = Path(tmp) / "data"
138
+ _make_csv(no_ext_path)
139
+
140
+ result = is_archive_file(no_ext_path)
141
+ assert result is False
142
+
143
+
144
+ def test_constants_defined():
145
+ """Verify required constants exist and have expected content."""
146
+ assert ".csv" in _NON_ARCHIVE_EXTS
147
+ assert ".parquet" in _NON_ARCHIVE_EXTS
148
+ assert ".zip" in _ARCHIVE_EXTS
149
+ assert ".tar" in _ARCHIVE_EXTS
150
+ assert "csv" in _EAGER_READERS
151
+ assert "csv" in _LAZY_READERS
152
+
153
+
154
+ def main():
155
+ tests = [
156
+ test_short_circuit_skips_magic_bytes_check,
157
+ test_short_circuit_for_all_non_archive_exts,
158
+ test_str_input_no_attribute_error,
159
+ test_reader_mapping_cached,
160
+ test_transtype_single_expr,
161
+ test_transtype_list,
162
+ test_transtype_none,
163
+ test_archive_path_still_reads,
164
+ test_unknown_extension_falls_back,
165
+ test_constants_defined,
166
+ ]
167
+ failed = 0
168
+ for t in tests:
169
+ try:
170
+ t()
171
+ print(f"PASS {t.__name__}")
172
+ except Exception as e:
173
+ print(f"FAIL {t.__name__}: {e}")
174
+ import traceback
175
+ traceback.print_exc()
176
+ failed += 1
177
+ if failed:
178
+ print(f"\n{failed}/{len(tests)} tests failed")
179
+ return 1
180
+ print(f"\n{len(tests)}/{len(tests)} tests passed")
181
+ return 0
182
+
183
+
184
+ if __name__ == "__main__":
185
+ sys.exit(main())
File without changes
File without changes
File without changes
File without changes
File without changes