bare-script 3.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bare_script/data.py ADDED
@@ -0,0 +1,477 @@
1
+ # Licensed under the MIT License
2
+ # https://github.com/craigahobbs/bare-script-py/blob/main/LICENSE
3
+
4
+ """
5
+ The BareScript data manipulation library
6
+ """
7
+
8
+ import datetime
9
+ import functools
10
+ import importlib
11
+ import statistics
12
+
13
+ from schema_markdown import parse_schema_markdown, validate_type
14
+
15
+ from .parser import parse_expression
16
+ from .value import value_boolean, value_compare, value_json, value_parse_datetime, value_parse_number
17
+
18
+
19
# Lazy accessor for the runtime's evaluate_expression function. The import is
# deferred to first use to break the circular dependency between this module
# and bare_script.runtime.
def _import_evaluate_expression():
    if not _EVALUATE_EXPRESSION:
        runtime = importlib.import_module('bare_script.runtime')
        _EVALUATE_EXPRESSION.append(runtime.evaluate_expression)
    return _EVALUATE_EXPRESSION[0]

# One-element cache for the lazily imported evaluate_expression function
_EVALUATE_EXPRESSION = []
26
+
27
+
28
def validate_data(data, csv=False):
    """
    Determine data field types and parse/validate field values

    :param data: The data array. Row objects are updated with parsed/validated values.
    :type data: list[dict]
    :param csv: If true, parse value strings
    :type csv: bool
    :return: The map of field name to field type ("boolean", "datetime", "number", "string")
    :rtype: dict
    :raises TypeError: Data is invalid
    """

    # First pass - infer each field's type from its first type-determinable value.
    # A CSV field whose values so far are only null strings ('' or 'null') stays
    # undetermined (None) until a later value decides it.
    types = {}
    for row in data:
        for field, value in row.items():
            if types.get(field) is not None:
                continue
            if isinstance(value, bool):
                types[field] = 'boolean'
            elif isinstance(value, (int, float)):
                types[field] = 'number'
            elif isinstance(value, datetime.date):
                types[field] = 'datetime'
            elif isinstance(value, str):
                if not csv:
                    # Not parsing CSV strings - any string value is just a string
                    types[field] = 'string'
                elif value in ('', 'null'):
                    # Null string - the type cannot be determined yet
                    types[field] = None
                elif value_parse_datetime(value) is not None:
                    types[field] = 'datetime'
                elif value in ('true', 'false'):
                    types[field] = 'boolean'
                elif value_parse_number(value) is not None:
                    types[field] = 'number'
                else:
                    types[field] = 'string'

    # Fields whose type could not be determined default to string
    for field in types:
        if types[field] is None:
            types[field] = 'string'

    # Local helper - raise a field validation error
    def raise_field_error(error_field, error_type, error_value):
        raise TypeError(f'Invalid "{error_field}" field value {value_json(error_value)}, expected type {error_type}')

    # Second pass - parse (when csv) and validate each value against its field type
    for row in data:
        for field, value in row.items():
            field_type = types.get(field)
            if field_type is None:
                continue

            # The CSV null string parses to None regardless of field type
            if csv and value == 'null':
                row[field] = None

            # Number field - parse CSV strings ('' parses to None); otherwise the
            # value must be an int/float (bool excluded) or None
            elif field_type == 'number':
                if csv and isinstance(value, str):
                    number_value = value_parse_number(value) if value != '' else None
                    if value != '' and number_value is None:
                        raise_field_error(field, field_type, value)
                    row[field] = number_value
                elif value is not None and (isinstance(value, bool) or not isinstance(value, (int, float))):
                    raise_field_error(field, field_type, value)

            # Datetime field - parse CSV strings ('' parses to None); otherwise the
            # value must be a datetime.date or None
            elif field_type == 'datetime':
                if csv and isinstance(value, str):
                    datetime_value = value_parse_datetime(value) if value != '' else None
                    if value != '' and datetime_value is None:
                        raise_field_error(field, field_type, value)
                    row[field] = datetime_value
                elif value is not None and not isinstance(value, datetime.date):
                    raise_field_error(field, field_type, value)

            # Boolean field - only 'true'/'false' CSV strings parse ('' parses to
            # None); otherwise the value must be a bool or None
            elif field_type == 'boolean':
                if csv and isinstance(value, str):
                    if value == '':
                        row[field] = None
                    elif value in ('true', 'false'):
                        row[field] = (value == 'true')
                    else:
                        raise_field_error(field, field_type, value)
                elif value is not None and not isinstance(value, bool):
                    raise_field_error(field, field_type, value)

            # String field - the value must be a str or None
            elif value is not None and not isinstance(value, str):
                raise_field_error(field, field_type, value)

    return types
136
+
137
+
138
def join_data(left_data, right_data, join_expr, right_expr=None, is_left_join=False, variables=None, options=None):
    """
    Join two data arrays

    :param left_data: The left data array
    :type left_data: list[dict]
    :param right_data: The right data array
    :type right_data: list[dict]
    :param join_expr: The join `expression <https://craigahobbs.github.io/bare-script-py/language/#expressions>`__
    :type join_expr: str
    :param right_expr: The right join `expression <https://craigahobbs.github.io/bare-script-py/language/#expressions>`__
        (defaults to the join expression)
    :type right_expr: str
    :param is_left_join: If true, perform a left join (always include left row)
    :type is_left_join: bool
    :param variables: Additional variables for expression evaluation
    :type variables: dict
    :param options: The :class:`script execution options <ExecuteScriptOptions>`
    :type options: dict
    :return: The joined data array
    :rtype: list[dict]
    """

    evaluate_expression = _import_evaluate_expression()

    # Compute the map of row field name to joined row field name. Right field
    # names that collide with a left field name are renamed with a numeric
    # suffix ("name2", "name3", ...) chosen to avoid all known field names.
    left_names = {}
    right_names_raw = {}
    right_names = {}
    for row in left_data:
        for field_name in row:
            if field_name not in left_names:
                left_names[field_name] = field_name
    for row in right_data:
        for field_name in row:
            if field_name not in right_names_raw:
                right_names_raw[field_name] = field_name
    for field_name in right_names_raw:
        if field_name not in left_names:
            right_names[field_name] = field_name
        else:
            ix_unique = 2
            unique_name = f'{field_name}{ix_unique}'
            while unique_name in left_names or unique_name in right_names or unique_name in right_names_raw:
                ix_unique += 1
                unique_name = f'{field_name}{ix_unique}'
            right_names[field_name] = unique_name

    # Create the evaluation options object - copy the caller's options and merge
    # the additional variables into its globals (the originals are not mutated)
    eval_options = options
    if variables is not None:
        eval_options = dict(options) if options is not None else {}
        if 'globals' in eval_options:
            eval_options['globals'] = {**eval_options['globals'], **variables}
        else:
            eval_options['globals'] = variables

    # Parse the left and right expressions (right defaults to the left expression)
    left_expression = parse_expression(join_expr)
    right_expression = parse_expression(right_expr) if right_expr is not None else left_expression

    # Bucket the right rows by the JSON of the right expression's value
    right_category_rows = {}
    for right_row in right_data:
        category_key = value_json(evaluate_expression(right_expression, eval_options, right_row))
        if category_key not in right_category_rows:
            right_category_rows[category_key] = []
        right_category_rows[category_key].append(right_row)

    # Join the left with the right - each matching right row yields one joined
    # row (a copy of the left row overlaid with renamed right fields)
    data = []
    for left_row in left_data:
        category_key = value_json(evaluate_expression(left_expression, eval_options, left_row))
        if category_key in right_category_rows:
            for right_row in right_category_rows[category_key]:
                join_row = dict(left_row)
                for right_name, right_value in right_row.items():
                    join_row[right_names[right_name]] = right_value
                data.append(join_row)
        elif not is_left_join:
            # NOTE(review): unmatched left rows are kept only when is_left_join is
            # false, which looks inverted relative to the documented "always include
            # left row" semantics above - confirm against the reference BareScript
            # (JavaScript) implementation before changing
            data.append(dict(left_row))

    return data
220
+
221
+
222
def add_calculated_field(data, field_name, expr, variables=None, options=None):
    """
    Add a calculated field to each row of a data array

    :param data: The data array. Row objects are updated with the calculated field values.
    :type data: list[dict]
    :param field_name: The calculated field name
    :type field_name: str
    :param expr: The calculated field expression
    :type expr: str
    :param variables: Additional variables for expression evaluation
    :type variables: dict
    :param options: The :class:`script execution options <ExecuteScriptOptions>`
    :type options: dict
    :return: The updated data array
    :rtype: list[dict]
    """

    evaluate_expression = _import_evaluate_expression()

    # Parse the calculated field expression once, up front
    calc_expression = parse_expression(expr)

    # Create the evaluation options - copy the caller's options and merge the
    # additional variables into its globals (the originals are not mutated)
    if variables is not None:
        eval_options = {} if options is None else dict(options)
        if 'globals' in eval_options:
            eval_options['globals'] = {**eval_options['globals'], **variables}
        else:
            eval_options['globals'] = variables
    else:
        eval_options = options

    # Evaluate the expression against each row, storing the result in-place
    for row in data:
        row[field_name] = evaluate_expression(calc_expression, eval_options, row)

    return data
259
+
260
+
261
def filter_data(data, expr, variables=None, options=None):
    """
    Filter data rows

    :param data: The data array
    :type data: list[dict]
    :param expr: The boolean filter `expression <https://craigahobbs.github.io/bare-script-py/language/#expressions>`__
    :type expr: str
    :param variables: Additional variables for expression evaluation
    :type variables: dict
    :param options: The :class:`script execution options <ExecuteScriptOptions>`
    :type options: dict
    :return: The filtered data array
    :rtype: list[dict]
    """

    evaluate_expression = _import_evaluate_expression()

    # Parse the boolean filter expression once, up front
    filter_expression = parse_expression(expr)

    # Create the evaluation options - copy the caller's options and merge the
    # additional variables into its globals (the originals are not mutated)
    if variables is not None:
        eval_options = {} if options is None else dict(options)
        if 'globals' in eval_options:
            eval_options['globals'] = {**eval_options['globals'], **variables}
        else:
            eval_options['globals'] = variables
    else:
        eval_options = options

    # Keep only the rows for which the filter expression evaluates true
    return [row for row in data if value_boolean(evaluate_expression(filter_expression, eval_options, row))]
298
+
299
+
300
def aggregate_data(data, aggregation):
    """
    Aggregate data rows

    :param data: The data array
    :type data: list[dict]
    :param aggregation: The `aggregation model <./library/model.html#var.vName='Aggregation'>`__
    :type aggregation: dict
    :return: The aggregated data array
    :rtype: list[dict]
    """

    # Validate the aggregation model
    validate_type(AGGREGATION_TYPES, 'Aggregation', aggregation)
    categories = aggregation.get('categories')
    measures = aggregation['measures']

    # Group the rows by category values, collecting each measure's non-null
    # values per group (rows with no categories all share the '' key)
    category_rows = {}
    for row in data:
        if categories is None:
            category_values = None
            row_key = ''
        else:
            category_values = [row.get(category) for category in categories]
            row_key = value_json(category_values)

        aggregate_row = category_rows.get(row_key)
        if aggregate_row is None:
            # New group - seed the aggregate row with the category values
            aggregate_row = {}
            category_rows[row_key] = aggregate_row
            if categories is not None:
                aggregate_row.update(zip(categories, category_values))

        for measure in measures:
            field = measure.get('name', measure['field'])
            value = row.get(measure['field'])
            measure_values = aggregate_row.setdefault(field, [])
            if value is not None:
                measure_values.append(value)

    # Reduce each group's collected measure values with its aggregation function
    # (a group with no non-null values aggregates to None)
    aggregate_rows = list(category_rows.values())
    for aggregate_row in aggregate_rows:
        for measure in measures:
            field = measure.get('name', measure['field'])
            measure_values = aggregate_row[field]
            if not measure_values:
                aggregate_row[field] = None
            else:
                func = measure['function']
                if func == 'count':
                    aggregate_row[field] = len(measure_values)
                elif func == 'max':
                    aggregate_row[field] = max(measure_values)
                elif func == 'min':
                    aggregate_row[field] = min(measure_values)
                elif func == 'sum':
                    aggregate_row[field] = sum(measure_values)
                elif func == 'stddev':
                    aggregate_row[field] = statistics.pstdev(measure_values)
                else:
                    # func == 'average'
                    aggregate_row[field] = statistics.mean(measure_values)

    return aggregate_rows
365
+
366
+
367
# The aggregation model type definitions, parsed from Schema Markdown. Used by
# aggregate_data to validate its "aggregation" argument.
AGGREGATION_TYPES = parse_schema_markdown('''\
group "Aggregation"


# A data aggregation specification
struct Aggregation

    # The aggregation category fields
    optional string[len > 0] categories

    # The aggregation measures
    AggregationMeasure[len > 0] measures


# An aggregation measure specification
struct AggregationMeasure

    # The aggregation measure field
    string field

    # The aggregation function
    AggregationFunction function

    # The aggregated-measure field name
    optional string name


# An aggregation function
enum AggregationFunction

    # The average of the measure's values
    average

    # The count of the measure's values
    count

    # The greatest of the measure's values
    max

    # The least of the measure's values
    min

    # The standard deviation of the measure's values
    stddev

    # The sum of the measure's values
    sum
''')
416
+
417
+
418
def sort_data(data, sorts):
    """
    Sort data rows

    :param data: The data array
    :type data: list[dict]
    :param sorts: The sort field-name/descending-sort tuples
    :type sorts: list[list]
    :return: The sorted data array
    :rtype: list[dict]
    """

    # Sort in place using the multi-field row comparison, then return the array
    sort_key = functools.cmp_to_key(lambda row1, row2: _sort_data_fn(sorts, row1, row2))
    data.sort(key=sort_key)
    return data
432
+
433
+
434
# Multi-field row comparison - compare by each sort field in turn, honoring the
# optional per-field descending flag, until some field orders the rows
def _sort_data_fn(sorts, row1, row2):
    for field, *desc_rest in sorts:
        descending = desc_rest[0] if desc_rest else False
        value1 = row1.get(field)
        value2 = row2.get(field)
        if descending:
            ordering = value_compare(value2, value1)
        else:
            ordering = value_compare(value1, value2)
        if ordering != 0:
            return ordering
    return 0
444
+
445
+
446
def top_data(data, count, category_fields=None):
    """
    Top data rows

    :param data: The data array
    :type data: list[dict]
    :param count: The number of rows to keep
    :type count: int
    :param category_fields: The category fields
    :type category_fields: list[str]
    :return: The top data array
    :rtype: list[dict]
    """

    # Bucket the rows by category key - dicts preserve first-seen key order, so
    # the buckets come back in category-encounter order
    category_rows = {}
    for row in data:
        if category_fields is None:
            category_key = ''
        else:
            category_key = value_json([row.get(field) for field in category_fields])
        category_rows.setdefault(category_key, []).append(row)

    # Keep at most "count" rows from each category (max guards negative counts)
    data_top = []
    for rows in category_rows.values():
        data_top.extend(rows[:max(count, 0)])

    return data_top
File without changes