zoopipe-2026.1.20-cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. zoopipe/__init__.py +72 -0
  2. zoopipe/engines/__init__.py +4 -0
  3. zoopipe/engines/base.py +45 -0
  4. zoopipe/engines/dask.py +225 -0
  5. zoopipe/engines/local.py +215 -0
  6. zoopipe/engines/ray.py +252 -0
  7. zoopipe/hooks/__init__.py +4 -0
  8. zoopipe/hooks/base.py +70 -0
  9. zoopipe/hooks/sql.py +94 -0
  10. zoopipe/input_adapter/__init__.py +24 -0
  11. zoopipe/input_adapter/arrow.py +38 -0
  12. zoopipe/input_adapter/base.py +48 -0
  13. zoopipe/input_adapter/csv.py +144 -0
  14. zoopipe/input_adapter/duckdb.py +54 -0
  15. zoopipe/input_adapter/excel.py +51 -0
  16. zoopipe/input_adapter/json.py +73 -0
  17. zoopipe/input_adapter/kafka.py +39 -0
  18. zoopipe/input_adapter/parquet.py +85 -0
  19. zoopipe/input_adapter/pygen.py +37 -0
  20. zoopipe/input_adapter/sql.py +103 -0
  21. zoopipe/manager.py +211 -0
  22. zoopipe/output_adapter/__init__.py +23 -0
  23. zoopipe/output_adapter/arrow.py +50 -0
  24. zoopipe/output_adapter/base.py +41 -0
  25. zoopipe/output_adapter/csv.py +71 -0
  26. zoopipe/output_adapter/duckdb.py +46 -0
  27. zoopipe/output_adapter/excel.py +42 -0
  28. zoopipe/output_adapter/json.py +66 -0
  29. zoopipe/output_adapter/kafka.py +39 -0
  30. zoopipe/output_adapter/parquet.py +49 -0
  31. zoopipe/output_adapter/pygen.py +29 -0
  32. zoopipe/output_adapter/sql.py +43 -0
  33. zoopipe/pipe.py +263 -0
  34. zoopipe/protocols.py +37 -0
  35. zoopipe/py.typed +0 -0
  36. zoopipe/report.py +173 -0
  37. zoopipe/utils/__init__.py +0 -0
  38. zoopipe/utils/dependency.py +78 -0
  39. zoopipe/zoopipe_rust_core.abi3.so +0 -0
  40. zoopipe-2026.1.20.dist-info/METADATA +231 -0
  41. zoopipe-2026.1.20.dist-info/RECORD +43 -0
  42. zoopipe-2026.1.20.dist-info/WHEEL +4 -0
  43. zoopipe-2026.1.20.dist-info/licenses/LICENSE +21 -0
zoopipe/engines/ray.py ADDED
@@ -0,0 +1,252 @@
+ from __future__ import annotations
+
+ import os
+ import re
+ from datetime import datetime
+ from importlib import metadata
+ from typing import TYPE_CHECKING, Any
+
+ import ray
+
+ from zoopipe.engines.base import BaseEngine
+ from zoopipe.engines.local import PipeReport
+ from zoopipe.report import FlowReport, FlowStatus
+ from zoopipe.utils.dependency import install_dependencies
+
+ if TYPE_CHECKING:
+     from zoopipe.pipe import Pipe
+
+
+ @ray.remote(memory=512 * 1024 * 1024)  # Limit actor memory to 512MB
+ class RayPipeWorker:
+     """
+     Ray actor that wraps a single Pipe execution.
+     """
+
+     def __init__(self, pipe: Pipe, index: int):
+         self.pipe = pipe
+         self.index = index
+         self.is_finished = False
+         self.has_error = False
+
+     def run(self) -> None:
+         try:
+             self.pipe.start(wait=True)
+         except Exception:
+             self.has_error = True
+         finally:
+             self.is_finished = True
+
+     def get_report(self) -> PipeReport:
+         report = self.pipe.report
+         return PipeReport(
+             pipe_index=self.index,
+             total_processed=report.total_processed,
+             success_count=report.success_count,
+             error_count=report.error_count,
+             ram_bytes=report.ram_bytes,
+             is_finished=self.is_finished or report.is_finished,
+             has_error=self.has_error,
+             is_alive=not self.is_finished,
+         )
+
+
+ @ray.remote
+ def _install_dependencies(packages: list[str]) -> None:
+     install_dependencies(packages)
+
+
+ class RayEngine(BaseEngine):
+     """
+     Distributed execution engine using Ray.
+     """
+
+     def __init__(self, address: str | None = None, **kwargs: Any):
+         if not ray.is_initialized():
+             # Silence the accelerator visible-devices warning in future Ray versions
+             os.environ.setdefault("RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO", "0")
+
+             # Prepare the default runtime_env and collect dependencies to install
+             runtime_env, deps = self._prepare_runtime_env(kwargs.pop("runtime_env", {}))
+
+             # Default to lean initialization
+             ray_args = {
+                 "address": address,
+                 "num_cpus": kwargs.pop("num_cpus", None),
+                 "include_dashboard": kwargs.pop("include_dashboard", False),
+                 "logging_level": kwargs.pop("logging_level", "error"),
+                 "runtime_env": runtime_env,
+                 **kwargs,
+             }
+             ray.init(**ray_args)
+
+             # Manually install dependencies on all nodes using our agnostic strategy
+             if deps:
+                 self._install_deps_on_all_nodes(deps)
+
+         self._workers: list[Any] = []
+         self._futures: list[Any] = []
+         self._start_time: datetime | None = None
+         self._cached_report: FlowReport | None = None
+
+     def _install_deps_on_all_nodes(self, deps: list[str]) -> None:
+         """
+         Run the agnostic dependency installer on all connected Ray nodes.
+         This provides support for pip, uv, and poetry environments.
+         """
+         nodes = ray.nodes()
+         alive_nodes = [n for n in nodes if n.get("Alive")]
+
+         refs = []
+         for node in alive_nodes:
+             # Pin the task to a specific node via its 'node:<ip>' custom
+             # resource, which Ray creates automatically on every node.
+             node_ip = node.get("NodeManagerAddress")
+             if node_ip:
+                 refs.append(
+                     _install_dependencies.options(
+                         resources={f"node:{node_ip}": 0.001}
+                     ).remote(deps)
+                 )
+
+         if refs:
+             ray.get(refs)
+
+     def _prepare_runtime_env(
+         self, runtime_env: dict[str, Any]
+     ) -> tuple[dict[str, Any], list[str]]:
+         """
+         Configure the Ray runtime environment based on whether we are in
+         development mode or being used as a library.
+         Returns the modified runtime_env and a list of dependencies to install manually.
+         """
+         # 1. Detect environment and versions
+         is_dev_mode = False
+         try:
+             # Heuristic: if we are in the zoopipe repo and the compiled
+             # extension is present, assume dev mode
+             if (
+                 os.path.exists("src/zoopipe")
+                 and os.path.exists("pyproject.toml")
+                 and any(f.endswith(".so") for f in os.listdir("src/zoopipe"))
+             ):
+                 is_dev_mode = True
+         except Exception:
+             pass
+
+         # 2. Set up pip dependencies
+         deps = []
+         if "pip" not in runtime_env:
+             if is_dev_mode:
+                 # Dev mode: extract dependencies from pyproject.toml (source of truth)
+                 try:
+                     with open("pyproject.toml", "r") as f:
+                         toml_content = f.read()
+                     # Find the dependencies = [ ... ] block
+                     match = re.search(
+                         r"dependencies\s*=\s*\[(.*?)\]", toml_content, re.DOTALL
+                     )
+                     if match:
+                         dep_block = match.group(1)
+                         deps = re.findall(r'["\'](.*?)["\']', dep_block)
+                 except Exception:
+                     pass
+             else:
+                 # User mode: the zoopipe package will pull its own dependencies
+                 try:
+                     version = metadata.version("zoopipe")
+                     deps.append(f"zoopipe=={version}")
+                 except metadata.PackageNotFoundError:
+                     # Fall back to the hardcoded core if everything fails
+                     deps = ["pydantic>=2.0"]
+
+             # NOTE: We deliberately do NOT set runtime_env["pip"] here; the
+             # agnostic installer handles these dependencies instead.
+
+         # 3. Ship code and binaries
+         if "working_dir" not in runtime_env:
+             runtime_env["working_dir"] = "."
+
+         # In dev mode, we need src/ in PYTHONPATH to find the local zoopipe
+         if is_dev_mode:
+             env_vars = runtime_env.get("env_vars", {})
+             if "PYTHONPATH" not in env_vars:
+                 # Ray adds working_dir to sys.path,
+                 # but we need src/ for 'import zoopipe'
+                 env_vars["PYTHONPATH"] = "./src"
+             runtime_env["env_vars"] = env_vars
+
+         return runtime_env, deps
+
+     def start(self, pipes: list[Pipe]) -> None:
+         if self.is_running:
+             raise RuntimeError("RayEngine is already running")
+
+         self._start_time = datetime.now()
+         self._workers = [RayPipeWorker.remote(pipe, i) for i, pipe in enumerate(pipes)]
+         self._futures = [w.run.remote() for w in self._workers]
+         self._cached_report = None
+
+     def wait(self, timeout: float | None = None) -> bool:
+         if not self._futures:
+             return True
+
+         ready, _ = ray.wait(
+             self._futures, num_returns=len(self._futures), timeout=timeout
+         )
+         return len(ready) == len(self._futures)
+
+     def shutdown(self, timeout: float = 5.0) -> None:
+         for worker in self._workers:
+             ray.kill(worker)
+         self._workers = []
+         self._futures = []
+         self._cached_report = None
+
+     @property
+     def is_running(self) -> bool:
+         if not self._futures:
+             return False
+         ready, _ = ray.wait(self._futures, num_returns=len(self._futures), timeout=0)
+         return len(ready) < len(self._futures)
+
+     @property
+     def report(self) -> FlowReport:
+         if self._cached_report and self._cached_report.is_finished:
+             return self._cached_report
+
+         report = FlowReport()
+         report.start_time = self._start_time
+
+         p_reports = self.pipe_reports
+         for pr in p_reports:
+             report.total_processed += pr.total_processed
+             report.success_count += pr.success_count
+             report.error_count += pr.error_count
+             report.ram_bytes += pr.ram_bytes
+
+         all_finished = all(pr.is_finished for pr in p_reports)
+         any_error = any(pr.has_error for pr in p_reports)
+
+         if all_finished:
+             report.status = FlowStatus.FAILED if any_error else FlowStatus.COMPLETED
+             report.end_time = datetime.now()
+             report._finished_event.set()
+             self._cached_report = report
+         else:
+             report.status = FlowStatus.RUNNING
+
+         return report
+
+     @property
+     def pipe_reports(self) -> list[PipeReport]:
+         if not self._workers:
+             return []
+         # Collect reports from all actors in a single pass
+         return ray.get([w.get_report.remote() for w in self._workers])
+
+     def get_pipe_report(self, index: int) -> PipeReport:
+         if not self._workers:
+             raise RuntimeError("Engine has not been started")
+         return ray.get(self._workers[index].get_report.remote())
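The engine's public surface is start / wait / report / shutdown. A minimal usage sketch, assuming `pipes` is a list of already-constructed zoopipe.pipe.Pipe objects (Pipe itself is defined in zoopipe/pipe.py, not shown here):

    from zoopipe.engines.ray import RayEngine

    engine = RayEngine(num_cpus=4)  # calls ray.init() lazily if Ray is not up
    engine.start(pipes)             # spawns one RayPipeWorker actor per pipe
    engine.wait()                   # blocks until every actor's run() returns
    print(engine.report.status)     # FlowStatus aggregated across all workers
    engine.shutdown()               # kills the actors and resets engine state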
zoopipe/hooks/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from zoopipe.hooks.base import BaseHook, HookPriority, HookStore
+ from zoopipe.hooks.sql import SQLExpansionHook
+
+ __all__ = ["BaseHook", "HookStore", "HookPriority", "SQLExpansionHook"]
zoopipe/hooks/base.py ADDED
@@ -0,0 +1,70 @@
+ import typing
+
+ from zoopipe.report import EntryTypedDict
+
+ #: Type alias for the shared state between hooks.
+ HookStore = dict[str, typing.Any]
+
+
+ class HookPriority:
+     """
+     Standard priority levels for hooks.
+
+     Lower values correspond to higher priority (run earlier).
+     """
+
+     VERY_HIGH = 0
+     HIGH = 25
+     NORMAL = 50
+     LOW = 75
+     VERY_LOW = 100
+
+
+ class BaseHook:
+     """
+     Abstract base class for pipeline lifecycle hooks.
+
+     Hooks allow executing custom Python logic at different stages of
+     the pipeline (setup, batch execution, and teardown). They can maintain
+     state between batches using the shared HookStore.
+     """
+
+     def __init__(self, priority: int = HookPriority.NORMAL):
+         """
+         Initialize the hook with a specific priority.
+
+         Args:
+             priority: Execution order (lower values run first).
+         """
+         self.priority = priority
+
+     def setup(self, store: HookStore) -> None:
+         """
+         Called once before the pipeline starts processing data.
+
+         Use this to initialize connections, resources, or shared state.
+         """
+         pass
+
+     def execute(
+         self, entries: list[EntryTypedDict], store: HookStore
+     ) -> list[EntryTypedDict]:
+         """
+         Process a batch of entries.
+
+         This method is where transformations or decorations happen.
+         It can modify the entries in place or return a new list.
+
+         Args:
+             entries: List of dictionaries representing pipeline items.
+             store: Shared state between different hooks and different batches.
+         """
+         return entries
+
+     def teardown(self, store: HookStore) -> None:
+         """
+         Called once after the pipeline finishes or if an error occurs.
+
+         Use this to release resources, close connections, or log final stats.
+         """
+         pass
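To illustrate the lifecycle contract, here is a minimal custom hook sketch; RowCounterHook is hypothetical and exists only for this example:

    from zoopipe.hooks.base import BaseHook, HookPriority, HookStore

    class RowCounterHook(BaseHook):
        """Count entries across batches via the shared HookStore."""

        def __init__(self):
            super().__init__(priority=HookPriority.LOW)  # run after most hooks

        def setup(self, store: HookStore) -> None:
            store["rows_seen"] = 0  # state in the store survives across batches

        def execute(self, entries, store: HookStore):
            store["rows_seen"] += len(entries)
            return entries  # pass the batch through unchanged

        def teardown(self, store: HookStore) -> None:
            print(f"Processed {store['rows_seen']} rows")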
zoopipe/hooks/sql.py ADDED
@@ -0,0 +1,94 @@
+ import typing
+
+ from zoopipe.hooks.base import BaseHook, HookStore
+ from zoopipe.report import EntryStatus, get_logger
+
+ if typing.TYPE_CHECKING:
+     from zoopipe.report import EntryTypedDict
+
+
+ class SQLExpansionHook(BaseHook):
+     """
+     Expands anchor records (e.g., ID ranges) into full records by querying a SQL table.
+
+     This hook is designed to work with chunked data ingestion. It takes minimal
+     identifying information (anchors) and performs a bulk fetch from the database
+     to retrieve the complete rows.
+     """
+
+     def __init__(
+         self, connection_factory: typing.Callable[[], typing.Any], table_name: str
+     ):
+         """
+         Initialize the SQLExpansionHook.
+
+         Args:
+             connection_factory: Callable that returns a database connection.
+             table_name: Name of the SQL table to fetch data from.
+         """
+         super().__init__()
+         self.connection_factory = connection_factory
+         self.table_name = table_name
+         self.logger = get_logger()
+
+     def setup(self, store: HookStore) -> None:
+         pass
+
+     def execute(
+         self, entries: list["EntryTypedDict"], store: HookStore
+     ) -> list["EntryTypedDict"]:
+         expanded = []
+         conn = self.connection_factory()
+
+         try:
+             cursor = conn.cursor()
+             self.logger.debug(
+                 f"SQLExpansionHook: Expanding batch of {len(entries)} anchor(s)"
+             )
+
+             for anchor in entries:
+                 raw = anchor["raw_data"]
+                 min_id = raw.get("min_id")
+                 max_id = raw.get("max_id")
+
+                 if min_id is None or max_id is None:
+                     continue
+
+                 cursor.execute(
+                     f"SELECT * FROM {self.table_name} WHERE id BETWEEN ? AND ?",
+                     (min_id, max_id),
+                 )
+
+                 columns = (
+                     [column[0] for column in cursor.description]
+                     if cursor.description
+                     else []
+                 )
+
+                 rows = cursor.fetchall()
+
+                 for row in rows:
+                     if columns:
+                         data = dict(zip(columns, row))
+                     else:
+                         data = dict(row)
+
+                     expanded.append(
+                         {
+                             "id": None,
+                             "position": None,
+                             "status": EntryStatus.PENDING,
+                             "raw_data": data,
+                             "validated_data": None,
+                             "metadata": anchor["metadata"],
+                             "errors": [],
+                         }
+                     )
+             cursor.close()
+         finally:
+             conn.close()
+
+         return expanded
+
+     def teardown(self, store: HookStore) -> None:
+         pass
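A construction sketch using sqlite3; the items.db path and items table are hypothetical. Note that execute() closes the connection in its finally block, so connection_factory should return a fresh DB-API connection each time; also, table_name is interpolated directly into the SQL string, so it must come from trusted code rather than user input:

    import sqlite3

    from zoopipe.hooks.sql import SQLExpansionHook

    hook = SQLExpansionHook(
        connection_factory=lambda: sqlite3.connect("items.db"),
        table_name="items",
    )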
zoopipe/input_adapter/__init__.py ADDED
@@ -0,0 +1,24 @@
+ from zoopipe.input_adapter.arrow import ArrowInputAdapter
+ from zoopipe.input_adapter.base import BaseInputAdapter
+ from zoopipe.input_adapter.csv import CSVInputAdapter
+ from zoopipe.input_adapter.duckdb import DuckDBInputAdapter
+ from zoopipe.input_adapter.excel import ExcelInputAdapter
+ from zoopipe.input_adapter.json import JSONInputAdapter
+ from zoopipe.input_adapter.kafka import KafkaInputAdapter
+ from zoopipe.input_adapter.parquet import ParquetInputAdapter
+ from zoopipe.input_adapter.pygen import PyGeneratorInputAdapter
+ from zoopipe.input_adapter.sql import SQLInputAdapter, SQLPaginationInputAdapter
+
+ __all__ = [
+     "BaseInputAdapter",
+     "CSVInputAdapter",
+     "JSONInputAdapter",
+     "DuckDBInputAdapter",
+     "ArrowInputAdapter",
+     "ExcelInputAdapter",
+     "SQLInputAdapter",
+     "SQLPaginationInputAdapter",
+     "ParquetInputAdapter",
+     "PyGeneratorInputAdapter",
+     "KafkaInputAdapter",
+ ]
zoopipe/input_adapter/arrow.py ADDED
@@ -0,0 +1,38 @@
+ import pathlib
+ import typing
+
+ from zoopipe.input_adapter.base import BaseInputAdapter
+ from zoopipe.zoopipe_rust_core import ArrowReader
+
+
+ class ArrowInputAdapter(BaseInputAdapter):
+     """
+     Reads records from Apache Arrow IPC (Feather) files.
+
+     Provides high-speed sequential access to Arrow data with minimal
+     serialization overhead.
+     """
+
+     def __init__(
+         self,
+         source: typing.Union[str, pathlib.Path],
+         generate_ids: bool = True,
+     ):
+         """
+         Initialize the ArrowInputAdapter.
+
+         Args:
+             source: Path to the Arrow file.
+             generate_ids: Whether to generate unique IDs for each record.
+         """
+         self.source_path = str(source)
+         self.generate_ids = generate_ids
+
+     def get_native_reader(self) -> ArrowReader:
+         return ArrowReader(
+             self.source_path,
+             generate_ids=self.generate_ids,
+         )
+
+
+ __all__ = ["ArrowInputAdapter"]
zoopipe/input_adapter/base.py ADDED
@@ -0,0 +1,48 @@
+ import abc
+ import typing
+
+
+ class BaseInputAdapter(abc.ABC):
+     """
+     Abstract base class for all input adapters.
+
+     Input adapters are responsible for providing a native Rust reader
+     and optional hooks that are specific to the data source.
+     """
+
+     @abc.abstractmethod
+     def get_native_reader(self) -> typing.Any:
+         """
+         Return the underlying Rust reader instance.
+
+         This reader must implement the common reader interface in Rust
+         to be compatible with the NativePipe.
+         """
+         raise NotImplementedError
+
+     def get_hooks(self) -> list[typing.Any]:
+         """
+         Return a list of hooks to be executed by the pipeline.
+
+         Typically used for pre-fetching data or expanding anchor records
+         before they reach the main processing stage.
+         """
+         return []
+
+     @property
+     def can_split(self) -> bool:
+         """Return True if this adapter supports parallel splitting."""
+         return type(self).split != BaseInputAdapter.split
+
+     def split(self, workers: int) -> typing.List["BaseInputAdapter"]:
+         """
+         Split the input adapter into `workers` shards for parallel processing.
+
+         Args:
+             workers: Number of partitions to create.
+
+         Returns:
+             A list of input adapters, each responsible for a subset of the data.
+             The default implementation returns [self] (no splitting).
+         """
+         return [self]
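The can_split property works by comparing bound functions: any subclass that overrides split reports True automatically. A minimal sketch with a hypothetical in-memory adapter (the iterator returned here stands in for a real Rust reader):

    import typing

    from zoopipe.input_adapter.base import BaseInputAdapter

    class ListInputAdapter(BaseInputAdapter):
        def __init__(self, rows: list[dict]):
            self.rows = rows

        def get_native_reader(self) -> typing.Any:
            return iter(self.rows)  # illustrative only; real adapters return Rust readers

        def split(self, workers: int) -> typing.List["BaseInputAdapter"]:
            # Round-robin sharding; defining split flips can_split to True
            return [ListInputAdapter(self.rows[i::workers]) for i in range(workers)]

    adapter = ListInputAdapter([{"id": 1}, {"id": 2}, {"id": 3}])
    assert adapter.can_split  # type(self).split differs from BaseInputAdapter.split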
zoopipe/input_adapter/csv.py ADDED
@@ -0,0 +1,144 @@
+ import csv
+ import pathlib
+ import typing
+
+ from zoopipe.input_adapter.base import BaseInputAdapter
+ from zoopipe.zoopipe_rust_core import CSVReader, get_file_size
+
+
+ class CSVInputAdapter(BaseInputAdapter):
+     """
+     A high-performance CSV reader supporting both local and S3 sources.
+
+     Uses a multi-threaded parser in the Rust core to ensure fast data ingestion
+     without blocking the Python GIL.
+     """
+
+     def __init__(
+         self,
+         source: typing.Union[str, pathlib.Path],
+         delimiter: str = ",",
+         quotechar: str = '"',
+         skip_rows: int = 0,
+         fieldnames: list[str] | None = None,
+         generate_ids: bool = True,
+         limit: int | None = None,
+         start_byte: int = 0,
+         end_byte: int | None = None,
+     ):
+         """
+         Initialize the CSVInputAdapter.
+
+         Args:
+             source: Path to the CSV file or S3 URI.
+             delimiter: Column separator.
+             quotechar: Character used for quoting fields.
+             skip_rows: Number of rows to skip at the beginning.
+             fieldnames: Optional list of column names.
+             generate_ids: Whether to generate unique IDs for each record.
+             limit: Maximum number of rows to read (optional).
+             start_byte: Byte offset to start reading from.
+             end_byte: Byte offset to stop reading at.
+         """
+         self.source_path = str(source)
+         self.delimiter = delimiter
+         self.quotechar = quotechar
+         self.skip_rows = skip_rows
+         self.fieldnames = fieldnames
+         self.generate_ids = generate_ids
+         self.limit = limit
+         self.start_byte = start_byte
+         self.end_byte = end_byte
+
+     def split(self, workers: int) -> typing.List["CSVInputAdapter"]:
+         """
+         Split the CSV input into `workers` byte-range shards.
+         """
+         file_size = get_file_size(self.source_path)
+         chunk_size = file_size // workers
+
+         # Ensure we have fieldnames if not explicitly provided.
+         # This is CRITICAL for partial reads (start_byte > 0).
+         final_fieldnames = self.fieldnames
+         if final_fieldnames is None:
+             if self.source_path.startswith("s3://"):
+                 # Use the Rust reader to discover headers from S3
+                 final_fieldnames = self.get_native_reader().headers
+             else:
+                 with open(self.source_path, "r") as f:
+                     reader = csv.reader(
+                         f, delimiter=self.delimiter, quotechar=self.quotechar
+                     )
+                     try:
+                         final_fieldnames = next(reader)
+                     except StopIteration:
+                         final_fieldnames = []
+
+         shards = []
+         for i in range(workers):
+             start = i * chunk_size
+             # The last worker takes the rest of the file
+             end = (i + 1) * chunk_size if i < workers - 1 else None
+
+             shards.append(
+                 self.__class__(
+                     source=self.source_path,
+                     delimiter=self.delimiter,
+                     quotechar=self.quotechar,
+                     skip_rows=self.skip_rows,
+                     fieldnames=final_fieldnames,
+                     generate_ids=self.generate_ids,
+                     limit=self.limit,
+                     start_byte=start,
+                     end_byte=end,
+                 )
+             )
+         return shards
+
+     def get_native_reader(self) -> CSVReader:
+         # Pass start_byte and end_byte through to the Rust reader
+         return CSVReader(
+             self.source_path,
+             delimiter=ord(self.delimiter),
+             quote=ord(self.quotechar),
+             skip_rows=self.skip_rows,
+             fieldnames=self.fieldnames,
+             generate_ids=self.generate_ids,
+             limit=self.limit,
+             start_byte=self.start_byte,
+             end_byte=self.end_byte,
+         )
+
+     @staticmethod
+     def count_rows(
+         source: str | pathlib.Path,
+         delimiter: str = ",",
+         quotechar: str = '"',
+         has_header: bool = True,
+     ) -> int:
+         """
+         Efficiently count the number of rows in a CSV file using the Rust core.
+
+         Args:
+             source: Path to the CSV file.
+             delimiter: Column separator. (Default: ',')
+             quotechar: Character used for quoting. (Default: '"')
+             has_header: Whether the first row is a header. Passed to the Rust
+                 core, which decides whether the first row counts as a record
+                 or is excluded as a header.
+
+         Returns:
+             Number of rows (records).
+         """
+         return CSVReader.count_rows(
+             str(source),
+             ord(delimiter),
+             ord(quotechar),
+             has_header,
+         )
+
+
+ __all__ = ["CSVInputAdapter"]