tellaro-query-language 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,335 @@
+ """Streaming file processor for efficient line-by-line data processing.
+
+ This module provides generator-based file processing to handle large files
+ with minimal memory footprint, supporting JSON, JSONL, and CSV formats.
+ """
+
+ import csv
+ import glob
+ import json
+ import os
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from typing import Any, Dict, Generator, List, Optional, Tuple
+
+ from .exceptions import TQLExecutionError
+ from .field_type_inference import FieldTypeInferencer
+
+
+ class StreamingFileProcessor:
+     """Processes files in a streaming fashion with minimal memory usage."""
+
+     def __init__(
+         self,
+         sample_size: int = 100,
+         csv_delimiter: str = ",",
+         field_types: Optional[Dict[str, str]] = None,
+         csv_headers: Optional[List[str]] = None,
+         no_header: bool = False,
+     ):
+         """Initialize the streaming processor.
+
+         Args:
+             sample_size: Number of records to sample for type inference
+             csv_delimiter: CSV delimiter character
+             field_types: Manual field type mappings
+             csv_headers: Manual CSV header names
+             no_header: Force CSV to be treated as having no header row
+         """
+         self.sample_size = sample_size
+         self.csv_delimiter = csv_delimiter
+         self.field_types = field_types or {}
+         self.csv_headers = csv_headers
+         self.no_header = no_header
+         self.type_inferencer = FieldTypeInferencer(sample_size=sample_size)
+
+     def process_file(self, file_path: str, input_format: str = "auto") -> Generator[Dict[str, Any], None, None]:
+         """Process a single file in streaming mode.
+
+         Args:
+             file_path: Path to file
+             input_format: File format ('json', 'jsonl', 'csv', 'auto')
+
+         Yields:
+             Parsed records as dictionaries
+
+         Raises:
+             TQLExecutionError: If file processing fails
+         """
+         if not os.path.exists(file_path):
+             raise TQLExecutionError(f"File not found: {file_path}")
+
+         # Auto-detect format if needed
+         if input_format == "auto":
+             input_format = self._detect_format(file_path)
+
+         # Route to appropriate processor
+         if input_format == "json":
+             yield from self._process_json_stream(file_path)
+         elif input_format == "jsonl":
+             yield from self._process_jsonl_stream(file_path)
+         elif input_format == "csv":
+             yield from self._process_csv_stream(file_path)
+         else:
+             raise TQLExecutionError(f"Unsupported format: {input_format}")
+
+     def process_folder(
+         self,
+         folder_path: str,
+         pattern: str = "*",
+         input_format: str = "auto",
+         recursive: bool = False,
+         parallel: int = 1,
+     ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
+         """Process multiple files in a folder.
+
+         Args:
+             folder_path: Path to folder
+             pattern: Glob pattern for file matching
+             input_format: File format
+             recursive: Process subdirectories recursively
+             parallel: Number of parallel workers (1 = sequential)
+
+         Yields:
+             Tuples of (file_path, record)
+
+         Raises:
+             TQLExecutionError: If folder processing fails
+         """
+         if not os.path.exists(folder_path):
+             raise TQLExecutionError(f"Folder not found: {folder_path}")
+
+         if not os.path.isdir(folder_path):
+             raise TQLExecutionError(f"Not a directory: {folder_path}")
+
+         # Build glob pattern
+         if recursive:
+             glob_pattern = os.path.join(folder_path, "**", pattern)
+         else:
+             glob_pattern = os.path.join(folder_path, pattern)
+
+         # Get matching files
+         matching_files = glob.glob(glob_pattern, recursive=recursive)
+         matching_files = [f for f in matching_files if os.path.isfile(f)]
+
+         if not matching_files:
+             raise TQLExecutionError(f"No files found matching pattern: {glob_pattern}")
+
+         if parallel <= 1:
+             # Sequential processing
+             for file_path in matching_files:
+                 for record in self.process_file(file_path, input_format):
+                     yield (file_path, record)
+         else:
+             # Parallel processing
+             yield from self._process_files_parallel(matching_files, input_format, parallel)
+
+     def _process_files_parallel(
+         self, file_paths: List[str], input_format: str, parallel: int
+     ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
+         """Process files in parallel using ThreadPoolExecutor.
+
+         Args:
+             file_paths: List of file paths
+             input_format: File format
+             parallel: Number of workers
+
+         Yields:
+             Tuples of (file_path, record)
+         """
+
+         def process_single_file(file_path: str) -> Tuple[str, List[Dict[str, Any]]]:
+             """Process a single file and return its results.
+
+             Note: each file's records are buffered into a list here, so
+             parallel mode trades the streaming memory benefit for throughput.
+             """
+             records = list(self.process_file(file_path, input_format))
+             return (file_path, records)
+
+         with ThreadPoolExecutor(max_workers=parallel) as executor:
+             # Submit all files
+             futures = {executor.submit(process_single_file, fp): fp for fp in file_paths}
+
+             # Yield results as they complete
+             for future in as_completed(futures):
+                 file_path, records = future.result()
+                 for record in records:
+                     yield (file_path, record)
+
+     def _detect_format(self, file_path: str) -> str:
+         """Detect file format from extension.
+
+         Args:
+             file_path: Path to file
+
+         Returns:
+             Detected format ('json', 'jsonl', or 'csv')
+         """
+         _, ext = os.path.splitext(file_path.lower())
+
+         if ext == ".json":
+             return "json"
+         elif ext in [".jsonl", ".ndjson"]:
+             return "jsonl"
+         elif ext == ".csv":
+             return "csv"
+         else:
+             # Default to JSONL for unknown extensions
+             return "jsonl"
+
+     def _process_json_stream(self, file_path: str) -> Generator[Dict[str, Any], None, None]:
+         """Process a JSON file (single object or array format).
+
+         Note: the whole file is currently loaded into memory; true
+         incremental parsing of large arrays remains a TODO.
+
+         Args:
+             file_path: Path to JSON file
+
+         Yields:
+             Parsed records
+         """
+         try:
+             # For JSON arrays, we need to load the full file
+             # TODO: Implement true streaming JSON array parser using ijson library
+             with open(file_path, "r", encoding="utf-8") as f:
+                 data = json.load(f)
+
+             if isinstance(data, dict):
+                 yield data
+             elif isinstance(data, list):
+                 for record in data:
+                     if isinstance(record, dict):
+                         yield record
+             else:
+                 raise TQLExecutionError(f"Invalid JSON structure in {file_path}")
+
+         except json.JSONDecodeError as e:
+             raise TQLExecutionError(f"JSON parsing error in {file_path}: {e}")
+         except TQLExecutionError:
+             raise  # Don't re-wrap the structure error raised above
+         except Exception as e:
+             raise TQLExecutionError(f"Error reading {file_path}: {e}")
+
+     def _process_jsonl_stream(self, file_path: str) -> Generator[Dict[str, Any], None, None]:
+         """Process JSONL file (one JSON object per line) in streaming mode.
+
+         Args:
+             file_path: Path to JSONL file
+
+         Yields:
+             Parsed records
+         """
+         try:
+             with open(file_path, "r", encoding="utf-8") as f:
+                 for line_num, line in enumerate(f, 1):
+                     line = line.strip()
+                     if not line:
+                         continue  # Skip empty lines
+
+                     try:
+                         record = json.loads(line)
+                         if isinstance(record, dict):
+                             yield record
+                     except json.JSONDecodeError as e:
+                         # Warn but continue processing
+                         print(f"Warning: Invalid JSON on line {line_num} in {file_path}: {e}")
+                         continue
+
+         except Exception as e:
+             raise TQLExecutionError(f"Error reading {file_path}: {e}")
+
+     def _process_csv_stream(self, file_path: str) -> Generator[Dict[str, Any], None, None]:  # noqa: C901
+         """Process CSV file in streaming mode with type inference.
+
+         Args:
+             file_path: Path to CSV file
+
+         Yields:
+             Parsed records with typed values
+         """
+         try:
+             # First pass: determine headers and infer types
+             with open(file_path, "r", encoding="utf-8") as f:
+                 reader = csv.reader(f, delimiter=self.csv_delimiter)
+
+                 # Get first row
+                 try:
+                     first_row = next(reader)
+                 except StopIteration:
+                     return  # Empty file
+
+                 # Determine headers
+                 has_headers = False
+                 headers = None
+
+                 if self.csv_headers:
+                     # Manual headers provided - first row is data
+                     headers = self.csv_headers
+                     has_headers = False
+                 elif self.no_header:
+                     # No headers, generate column names
+                     headers = [f"column{i + 1}" for i in range(len(first_row))]
+                     has_headers = False
+                 else:
+                     # Auto-detect headers
+                     try:
+                         second_row = next(reader)
+                         has_headers = self.type_inferencer.detect_csv_headers(first_row, second_row)
+
+                         if has_headers:
+                             headers = first_row
+                         else:
+                             headers = [f"column{i + 1}" for i in range(len(first_row))]
+                     except StopIteration:
+                         # Only one row: treat it as headers, so there is no data to yield
+                         return
+
+                 # Sample data for type inference if no manual types provided
+                 inferred_types = self.field_types or {}
+                 if not self.field_types:
+                     # Rewind and collect sample
+                     f.seek(0)
+                     reader = csv.reader(f, delimiter=self.csv_delimiter)
+
+                     # Skip header row only if file has headers (not manual)
+                     if has_headers:
+                         try:
+                             next(reader)
+                         except StopIteration:
+                             return  # Empty file with only headers
+
+                     # Collect sample
+                     sample_records = []
+                     for i, row in enumerate(reader):
+                         if i >= self.sample_size:
+                             break
+                         if len(row) == len(headers):
+                             record = dict(zip(headers, row))
+                             sample_records.append(record)
+
+                     # Infer types from sample
+                     if sample_records:
+                         inferred_types = self.type_inferencer.infer_from_records(sample_records)
+
+             # Second pass: process all rows with type conversion
+             with open(file_path, "r", encoding="utf-8") as f:
+                 reader = csv.reader(f, delimiter=self.csv_delimiter)
+
+                 # Skip header row only if file has headers (not manual)
+                 if has_headers:
+                     try:
+                         next(reader)
+                     except StopIteration:
+                         return  # Empty file with only headers
+
+                 # Process all rows
+                 for row in reader:
+                     if len(row) != len(headers):
+                         continue  # Skip malformed rows
+
+                     # Convert row to dict with type conversion
+                     record = {}
+                     for header, value in zip(headers, row):
+                         field_type = inferred_types.get(header, "string")
+                         record[header] = self.type_inferencer.convert_value(value, field_type)
+
+                     yield record
+
+         except Exception as e:
+             raise TQLExecutionError(f"Error reading CSV file {file_path}: {e}")
@@ -1,21 +0,0 @@
- MIT License
-
- Copyright (c) 2024 Tellaro
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.