tellaro-query-language 0.2.3-py3-none-any.whl → 0.2.5-py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.
- tellaro_query_language-0.2.5.dist-info/LICENSE +72 -0
- tellaro_query_language-0.2.5.dist-info/METADATA +806 -0
- {tellaro_query_language-0.2.3.dist-info → tellaro_query_language-0.2.5.dist-info}/RECORD +21 -18
- {tellaro_query_language-0.2.3.dist-info → tellaro_query_language-0.2.5.dist-info}/entry_points.txt +1 -0
- tql/cache/memory.py +1 -1
- tql/cli.py +484 -0
- tql/core.py +244 -5
- tql/evaluator.py +1 -1
- tql/evaluator_components/special_expressions.py +62 -10
- tql/evaluator_components/value_comparison.py +0 -4
- tql/exceptions.py +6 -4
- tql/field_type_inference.py +285 -0
- tql/mutators/geo.py +57 -20
- tql/opensearch_components/query_converter.py +1 -1
- tql/opensearch_stats.py +7 -6
- tql/parser.py +7 -3
- tql/post_processor.py +8 -4
- tql/scripts.py +3 -3
- tql/stats_evaluator.py +357 -5
- tql/streaming_file_processor.py +335 -0
- tellaro_query_language-0.2.3.dist-info/LICENSE +0 -21
- tellaro_query_language-0.2.3.dist-info/METADATA +0 -433
- {tellaro_query_language-0.2.3.dist-info → tellaro_query_language-0.2.5.dist-info}/WHEEL +0 -0
tql/streaming_file_processor.py
@@ -0,0 +1,335 @@
+"""Streaming file processor for efficient line-by-line data processing.
+
+This module provides generator-based file processing to handle large files
+with minimal memory footprint, supporting JSON, JSONL, and CSV formats.
+"""
+
+import csv
+import glob
+import json
+import os
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Dict, Generator, List, Optional, Tuple
+
+from .exceptions import TQLExecutionError
+from .field_type_inference import FieldTypeInferencer
+
+
+class StreamingFileProcessor:
+    """Processes files in a streaming fashion with minimal memory usage."""
+
+    def __init__(
+        self,
+        sample_size: int = 100,
+        csv_delimiter: str = ",",
+        field_types: Optional[Dict[str, str]] = None,
+        csv_headers: Optional[List[str]] = None,
+        no_header: bool = False,
+    ):
+        """Initialize the streaming processor.
+
+        Args:
+            sample_size: Number of records to sample for type inference
+            csv_delimiter: CSV delimiter character
+            field_types: Manual field type mappings
+            csv_headers: Manual CSV header names
+            no_header: Force CSV to be treated as having no header row
+        """
+        self.sample_size = sample_size
+        self.csv_delimiter = csv_delimiter
+        self.field_types = field_types or {}
+        self.csv_headers = csv_headers
+        self.no_header = no_header
+        self.type_inferencer = FieldTypeInferencer(sample_size=sample_size)
+
+    def process_file(self, file_path: str, input_format: str = "auto") -> Generator[Dict[str, Any], None, None]:
+        """Process a single file in streaming mode.
+
+        Args:
+            file_path: Path to file
+            input_format: File format ('json', 'jsonl', 'csv', 'auto')
+
+        Yields:
+            Parsed records as dictionaries
+
+        Raises:
+            TQLExecutionError: If file processing fails
+        """
+        if not os.path.exists(file_path):
+            raise TQLExecutionError(f"File not found: {file_path}")
+
+        # Auto-detect format if needed
+        if input_format == "auto":
+            input_format = self._detect_format(file_path)
+
+        # Route to appropriate processor
+        if input_format == "json":
+            yield from self._process_json_stream(file_path)
+        elif input_format == "jsonl":
+            yield from self._process_jsonl_stream(file_path)
+        elif input_format == "csv":
+            yield from self._process_csv_stream(file_path)
+        else:
+            raise TQLExecutionError(f"Unsupported format: {input_format}")
+
+    def process_folder(
+        self,
+        folder_path: str,
+        pattern: str = "*",
+        input_format: str = "auto",
+        recursive: bool = False,
+        parallel: int = 1,
+    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
+        """Process multiple files in a folder.
+
+        Args:
+            folder_path: Path to folder
+            pattern: Glob pattern for file matching
+            input_format: File format
+            recursive: Process subdirectories recursively
+            parallel: Number of parallel workers (1 = sequential)
+
+        Yields:
+            Tuples of (file_path, record)
+
+        Raises:
+            TQLExecutionError: If folder processing fails
+        """
+        if not os.path.exists(folder_path):
+            raise TQLExecutionError(f"Folder not found: {folder_path}")
+
+        if not os.path.isdir(folder_path):
+            raise TQLExecutionError(f"Not a directory: {folder_path}")
+
+        # Build glob pattern
+        if recursive:
+            glob_pattern = os.path.join(folder_path, "**", pattern)
+        else:
+            glob_pattern = os.path.join(folder_path, pattern)
+
+        # Get matching files
+        matching_files = glob.glob(glob_pattern, recursive=recursive)
+        matching_files = [f for f in matching_files if os.path.isfile(f)]
+
+        if not matching_files:
+            raise TQLExecutionError(f"No files found matching pattern: {glob_pattern}")
+
+        if parallel <= 1:
+            # Sequential processing
+            for file_path in matching_files:
+                for record in self.process_file(file_path, input_format):
+                    yield (file_path, record)
+        else:
+            # Parallel processing
+            yield from self._process_files_parallel(matching_files, input_format, parallel)
+
+    def _process_files_parallel(
+        self, file_paths: List[str], input_format: str, parallel: int
+    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
+        """Process files in parallel using ThreadPoolExecutor.
+
+        Args:
+            file_paths: List of file paths
+            input_format: File format
+            parallel: Number of workers
+
+        Yields:
+            Tuples of (file_path, record)
+        """
+
+        def process_single_file(file_path: str) -> Tuple[str, List[Dict[str, Any]]]:
+            """Process a single file and return results."""
+            records = list(self.process_file(file_path, input_format))
+            return (file_path, records)
+
+        with ThreadPoolExecutor(max_workers=parallel) as executor:
+            # Submit all files
+            futures = {executor.submit(process_single_file, fp): fp for fp in file_paths}
+
+            # Yield results as they complete
+            for future in as_completed(futures):
+                file_path, records = future.result()
+                for record in records:
+                    yield (file_path, record)
+
+    def _detect_format(self, file_path: str) -> str:
+        """Detect file format from extension.
+
+        Args:
+            file_path: Path to file
+
+        Returns:
+            Detected format ('json', 'jsonl', or 'csv')
+        """
+        _, ext = os.path.splitext(file_path.lower())
+
+        if ext == ".json":
+            return "json"
+        elif ext in [".jsonl", ".ndjson"]:
+            return "jsonl"
+        elif ext == ".csv":
+            return "csv"
+        else:
+            # Default to JSONL for unknown extensions
+            return "jsonl"
+
+    def _process_json_stream(self, file_path: str) -> Generator[Dict[str, Any], None, None]:
+        """Process JSON file (array format) in streaming mode.
+
+        For large JSON arrays, this attempts to parse incrementally.
+        Falls back to full load for small files.
+
+        Args:
+            file_path: Path to JSON file
+
+        Yields:
+            Parsed records
+        """
+        try:
+            # For JSON arrays, we need to load the full file
+            # TODO: Implement true streaming JSON array parser using ijson library
+            with open(file_path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+
+            if isinstance(data, dict):
+                yield data
+            elif isinstance(data, list):
+                for record in data:
+                    if isinstance(record, dict):
+                        yield record
+            else:
+                raise TQLExecutionError(f"Invalid JSON structure in {file_path}")
+
+        except json.JSONDecodeError as e:
+            raise TQLExecutionError(f"JSON parsing error in {file_path}: {e}")
+        except Exception as e:
+            raise TQLExecutionError(f"Error reading {file_path}: {e}")
+
+    def _process_jsonl_stream(self, file_path: str) -> Generator[Dict[str, Any], None, None]:
+        """Process JSONL file (one JSON object per line) in streaming mode.
+
+        Args:
+            file_path: Path to JSONL file
+
+        Yields:
+            Parsed records
+        """
+        try:
+            with open(file_path, "r", encoding="utf-8") as f:
+                for line_num, line in enumerate(f, 1):
+                    line = line.strip()
+                    if not line:
+                        continue  # Skip empty lines
+
+                    try:
+                        record = json.loads(line)
+                        if isinstance(record, dict):
+                            yield record
+                    except json.JSONDecodeError as e:
+                        # Log warning but continue processing
+                        print(f"Warning: Invalid JSON on line {line_num} in {file_path}: {e}")
+                        continue
+
+        except Exception as e:
+            raise TQLExecutionError(f"Error reading {file_path}: {e}")
+
+    def _process_csv_stream(self, file_path: str) -> Generator[Dict[str, Any], None, None]:  # noqa: C901
+        """Process CSV file in streaming mode with type inference.
+
+        Args:
+            file_path: Path to CSV file
+
+        Yields:
+            Parsed records with typed values
+        """
+        try:
+            # First pass: determine headers and infer types
+            with open(file_path, "r", encoding="utf-8") as f:
+                reader = csv.reader(f, delimiter=self.csv_delimiter)
+
+                # Get first row
+                try:
+                    first_row = next(reader)
+                except StopIteration:
+                    return  # Empty file
+
+                # Determine headers
+                has_headers = False
+                headers = None
+
+                if self.csv_headers:
+                    # Manual headers provided - first row is data
+                    headers = self.csv_headers
+                    has_headers = False
+                elif self.no_header:
+                    # No headers, generate column names
+                    headers = [f"column{i + 1}" for i in range(len(first_row))]
+                    has_headers = False
+                else:
+                    # Auto-detect headers
+                    try:
+                        second_row = next(reader)
+                        has_headers = self.type_inferencer.detect_csv_headers(first_row, second_row)
+
+                        if has_headers:
+                            headers = first_row
+                        else:
+                            headers = [f"column{i + 1}" for i in range(len(first_row))]
+                    except StopIteration:
+                        # Only one row, treat as headers
+                        headers = first_row
+                        return
+
+                # Sample data for type inference if no manual types provided
+                inferred_types = self.field_types or {}
+                if not self.field_types:
+                    # Rewind and collect sample
+                    f.seek(0)
+                    reader = csv.reader(f, delimiter=self.csv_delimiter)
+
+                    # Skip header row only if file has headers (not manual)
+                    if has_headers:
+                        try:
+                            next(reader)
+                        except StopIteration:
+                            return  # Empty file with only headers
+
+                    # Collect sample
+                    sample_records = []
+                    for i, row in enumerate(reader):
+                        if i >= self.sample_size:
+                            break
+                        if len(row) == len(headers):
+                            record = dict(zip(headers, row))
+                            sample_records.append(record)
+
+                    # Infer types from sample
+                    if sample_records:
+                        inferred_types = self.type_inferencer.infer_from_records(sample_records)
+
+            # Second pass: process all rows with type conversion
+            with open(file_path, "r", encoding="utf-8") as f:
+                reader = csv.reader(f, delimiter=self.csv_delimiter)
+
+                # Skip header row only if file has headers (not manual)
+                if has_headers:
+                    try:
+                        next(reader)
+                    except StopIteration:
+                        return  # Empty file with only headers
+
+                # Process all rows
+                for row in reader:
+                    if len(row) != len(headers):
+                        continue  # Skip malformed rows
+
+                    # Convert row to dict with type conversion
+                    record = {}
+                    for header, value in zip(headers, row):
+                        field_type = inferred_types.get(header, "string")
+                        record[header] = self.type_inferencer.convert_value(value, field_type)
+
+                    yield record
+
+        except Exception as e:
+            raise TQLExecutionError(f"Error reading CSV file {file_path}: {e}")
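
Since tql/streaming_file_processor.py is new in 0.2.5, a brief usage sketch may help. Everything below is inferred from the diff above, not from documented API: the import path follows the wheel's RECORD layout, and the sample paths are illustrative assumptions.

    # Hedged sketch: StreamingFileProcessor and its arguments come from the
    # diff above; the file and folder names here are hypothetical.
    from tql.streaming_file_processor import StreamingFileProcessor

    processor = StreamingFileProcessor(sample_size=50, csv_delimiter=",")

    # Stream a single file; with input_format="auto" the format is inferred
    # from the extension (.json, .jsonl/.ndjson, .csv; anything else -> jsonl).
    for record in processor.process_file("events.jsonl"):
        if record.get("status") == "error":
            print(record)

    # Stream a folder of CSVs with four worker threads; yields
    # (file_path, record) tuples as each file completes.
    for path, record in processor.process_folder(
        "logs/", pattern="*.csv", input_format="csv", recursive=True, parallel=4
    ):
        print(path, record)

One caveat visible in _process_files_parallel: each worker materializes a whole file's records (list(self.process_file(...))) before yielding, so parallel > 1 trades the per-file memory guarantee for throughput.
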
tellaro_query_language-0.2.3.dist-info/LICENSE
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2024 Tellaro
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.