thds.tabularasa 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. thds/tabularasa/__init__.py +6 -0
  2. thds/tabularasa/__main__.py +1122 -0
  3. thds/tabularasa/compat.py +33 -0
  4. thds/tabularasa/data_dependencies/__init__.py +0 -0
  5. thds/tabularasa/data_dependencies/adls.py +97 -0
  6. thds/tabularasa/data_dependencies/build.py +573 -0
  7. thds/tabularasa/data_dependencies/sqlite.py +286 -0
  8. thds/tabularasa/data_dependencies/tabular.py +167 -0
  9. thds/tabularasa/data_dependencies/util.py +209 -0
  10. thds/tabularasa/diff/__init__.py +0 -0
  11. thds/tabularasa/diff/data.py +346 -0
  12. thds/tabularasa/diff/schema.py +254 -0
  13. thds/tabularasa/diff/summary.py +249 -0
  14. thds/tabularasa/git_util.py +37 -0
  15. thds/tabularasa/loaders/__init__.py +0 -0
  16. thds/tabularasa/loaders/lazy_adls.py +44 -0
  17. thds/tabularasa/loaders/parquet_util.py +385 -0
  18. thds/tabularasa/loaders/sqlite_util.py +346 -0
  19. thds/tabularasa/loaders/util.py +532 -0
  20. thds/tabularasa/py.typed +0 -0
  21. thds/tabularasa/schema/__init__.py +7 -0
  22. thds/tabularasa/schema/compilation/__init__.py +20 -0
  23. thds/tabularasa/schema/compilation/_format.py +50 -0
  24. thds/tabularasa/schema/compilation/attrs.py +257 -0
  25. thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
  26. thds/tabularasa/schema/compilation/io.py +96 -0
  27. thds/tabularasa/schema/compilation/pandas.py +252 -0
  28. thds/tabularasa/schema/compilation/pyarrow.py +93 -0
  29. thds/tabularasa/schema/compilation/sphinx.py +550 -0
  30. thds/tabularasa/schema/compilation/sqlite.py +69 -0
  31. thds/tabularasa/schema/compilation/util.py +117 -0
  32. thds/tabularasa/schema/constraints.py +327 -0
  33. thds/tabularasa/schema/dtypes.py +153 -0
  34. thds/tabularasa/schema/extract_from_parquet.py +132 -0
  35. thds/tabularasa/schema/files.py +215 -0
  36. thds/tabularasa/schema/metaschema.py +1007 -0
  37. thds/tabularasa/schema/util.py +123 -0
  38. thds/tabularasa/schema/validation.py +878 -0
  39. thds/tabularasa/sqlite3_compat.py +41 -0
  40. thds/tabularasa/sqlite_from_parquet.py +34 -0
  41. thds/tabularasa/to_sqlite.py +56 -0
  42. thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
  43. thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
  44. thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
  45. thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
  46. thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0
thds/tabularasa/data_dependencies/build.py
@@ -0,0 +1,573 @@
+ import json
+ from logging import getLogger
+ from pathlib import Path
+ from typing import (
+     Callable,
+     Collection,
+     Dict,
+     Iterable,
+     List,
+     Mapping,
+     Optional,
+     Tuple,
+     Type,
+     Union,
+     cast,
+ )
+
+ import networkx as nx
+ import pandas as pd
+ import pkg_resources
+ import pyarrow
+ import pyarrow.parquet
+ import setuptools.command.build_py
+
+ from thds.tabularasa.loaders.util import PandasParquetLoader, default_parquet_package_data_path
+ from thds.tabularasa.schema import load_schema
+ from thds.tabularasa.schema.compilation import (
+     render_attrs_module,
+     render_attrs_sqlite_schema,
+     render_pandera_module,
+     render_pyarrow_schema,
+     render_sql_schema,
+     write_if_ast_changed,
+     write_sql,
+ )
+ from thds.tabularasa.schema.files import LocalDataSpec, TabularFileSource
+ from thds.tabularasa.schema.metaschema import (
+     ADLSRef,
+     BuildOptions,
+     LocalRef,
+     RawDataDependencies,
+     ReferenceDataRef,
+     Schema,
+     Table,
+     TabularTextFileRef,
+     TransientReferenceDataRef,
+     is_build_time_package_table,
+ )
+ from thds.tabularasa.schema.util import predecessor_graph
+
+ from .adls import ADLSDownloadResult, sync_adls_data
+ from .sqlite import populate_sqlite_db
+ from .tabular import PandasCSVLoader
+ from .util import (
+     PARQUET_FORMAT_VERSION,
+     arrow_table_for_parquet_write,
+     hash_file,
+     import_data_preprocessor,
+     package_data_file_size,
+ )
+
+ ResourceRef = Union[ADLSRef, ReferenceDataRef, LocalRef]
+ # do-nothing computational DAG nodes - we exclude these from the compute order for better visibility on
+ # the number of meaningful build steps
+ NoOpRefTypes = (TabularTextFileRef, LocalRef)
+
+ METADATA_FILE = "meta.json"
+
+
+ class ReferenceDataBuildCommand(setuptools.command.build_py.build_py):
+     """Use in your setup.py as follows:
+
+     .. code-block:: python
+
+         from setuptools import setup
+
+         my_build_cmd = ReferenceDataBuildCommand.with_options(
+             package_name="my_package",
+             schema_path="path/inside/my_package/to/my/schema.yaml",
+         )
+
+         setup(
+             ...
+             cmdclass={"build_py": my_build_cmd}
+             ...
+         )
+     """
+
+     package_name: str
+     schema_path: str
+     for_setup_py_build: bool
+     schema: Schema
+
+     @classmethod
+     def with_options(
+         cls,
+         *,
+         package_name: str,
+         schema_path: str,
+         for_setup_py_build: bool = True,
+     ) -> Type["ReferenceDataBuildCommand"]:
+         """Supply parameters specifying a reference data build for a specific package
+
+         :param package_name: name of the package where the data is to be defined and stored
+         :param schema_path: path to the schema relative to the package root; should be a YAML file
+             compatible with the structure of `thds.tabularasa.schema.metaschema.Schema`
+         :param for_setup_py_build: if `True` (the default), this indicates that this class is being used
+             for packaging/building in the context of a setuptools build. This will cause some steps to
+             execute that may not be wanted in other workflows, in which case it should be set to `False`
+         :return: a `ReferenceDataBuildCommand` subclass with the provided fields populated
+         """
+         namespace = locals()
+         namespace.pop("cls")
+         return type(
+             cls.__name__,
+             (cls,),
+             namespace,
+         )  # type: ignore
+
+     @property
+     def options(self) -> BuildOptions:
+         assert self.schema.build_options is not None
+         return self.schema.build_options
+
+     def __init__(self, *args, **kwargs):
+         self.schema = load_schema(
+             package=self.package_name,
+             schema_path=self.schema_path,
+             require_preprocessors=self.for_setup_py_build,
+             require_data_resources=self.for_setup_py_build,
+         )
+         self.derived_code_submodule_dir: Path = Path(
+             pkg_resources.resource_filename(
+                 self.package_name,
+                 self.options.derived_code_submodule.replace(".", "/"),
+             )
+         )
+         assert self.schema.build_options is not None, "Can't build without build_options being specified"
+
+         if self.for_setup_py_build:
+             super().__init__(*args, **kwargs)
+
+     def run(self):
+         self.write_derived_source_code()
+         super().run()
+
+     def build_package_data(self, check_hash: bool = True, tables: Optional[Collection[str]] = None):
+         # derive and write package data
+         package_data_paths: List[str] = [self.schema_path]
+
+         if tables is None:
+             build_kw = {}
+         else:
+             unknown_tables = {t for t in tables if t not in self.schema.tables}
+             if unknown_tables:
+                 raise KeyError(f"Unknown tables: {','.join(unknown_tables)}")
+             build_kw = dict(table_predicate=lambda table: table.name in tables)  # type: ignore
+
+         if any(
+             table.build_time_installed and not table.transient for table in self.schema.tables.values()
+         ):
+             assert (
+                 self.options.package_data_dir is not None and self.options.transient_data_dir is not None
+             ), "package_data_dir or transient_data_dir not specified; can't write tables"
+             package_data_table_paths, _ = write_package_data_tables(
+                 self.schema,
+                 package=self.package_name,
+                 output_data_dir=self.options.package_data_dir,
+                 transient_data_dir=self.options.transient_data_dir,
+                 check_hash=check_hash,
+                 validate_transient_tables=self.options.validate_transient_tables,
+                 **build_kw,
+             )
+             package_data_paths.extend(
+                 get_data_files_to_package(
+                     self.package_name,
+                     package_data_table_paths,
+                     size_limit=self.options.package_data_file_size_limit,
+                 )
+             )
+
+         if self.options.sqlite_data:
+             # now initialize database and load reference data into database
+             assert (
+                 self.options.sqlite_db_path is not None
+             ), "No sqlite_db_path specified; can't populate db"
+             assert (
+                 self.options.package_data_dir is not None
+             ), "No package_data_dir specified; can't populate db"
+             assert (
+                 self.options.transient_data_dir is not None
+             ), "No transient_data_dir specified; can't populate db"
+             populate_sqlite_db(
+                 self.schema,
+                 db_package=self.package_name,
+                 db_path=self.options.sqlite_db_path,
+                 data_package=self.package_name,
+                 data_dir=self.options.package_data_dir,
+                 transient_data_dir=self.options.transient_data_dir,
+                 check_hash=check_hash,
+                 **build_kw,  # type: ignore
+             )
+             package_data_paths.append(self.options.sqlite_db_path)
+
+         package_data = {
+             "": [METADATA_FILE, "py.typed"],
+             self.package_name: package_data_paths,
+         }
+         if hasattr(self, "package_data") and self.package_data is not None:
+             self.package_data = {
+                 key: self.package_data.get(key, []) + package_data.get(key, [])
+                 for key in set(self.package_data).union(package_data)
+             }
+         else:
+             self.package_data = package_data
+
+         getLogger(__name__).info(f"package_data set to:\n{json.dumps(self.package_data, indent=4)}")
+
+     def write_derived_source_code(self):
+         # attrs classes needed for sqlite interface
+         if self.options.attrs or self.options.sqlite_interface:
+             attrs_source = render_attrs_module(
+                 self.schema,
+                 package=self.package_name,
+             )
+             write_if_ast_changed(attrs_source, self.derived_code_submodule_dir / "attrs.py")
+         if self.options.pandas:
+             pandas_source = render_pandera_module(
+                 self.schema,
+                 package=self.package_name,
+             )
+             write_if_ast_changed(pandas_source, self.derived_code_submodule_dir / "pandas.py")
+         if self.options.pyarrow:
+             pyarrow_source = render_pyarrow_schema(self.schema)
+             write_if_ast_changed(pyarrow_source, self.derived_code_submodule_dir / "pyarrow.py")
+         if self.options.sqlite_interface:
+             attrs_sqlite_source = render_attrs_sqlite_schema(
+                 self.schema,
+                 package=self.package_name,
+                 db_path=self.options.sqlite_db_path or "",
+             )
+             write_if_ast_changed(
+                 attrs_sqlite_source, self.derived_code_submodule_dir / "attrs_sqlite.py"
+             )
+             sql_table_source, sql_index_source = render_sql_schema(self.schema)
+             write_sql(sql_table_source, self.derived_code_submodule_dir / "table.sql")
+             write_sql(sql_index_source, self.derived_code_submodule_dir / "index.sql")
+
+
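Outside of a setup.py run, the same subclass can also be driven directly, e.g. to regenerate the derived submodules or rebuild package data from a developer script. A minimal sketch under stated assumptions: `my_package` and its `schema.yaml` are hypothetical, and `for_setup_py_build=False` simply skips the setuptools-specific `__init__` steps shown above.

    # Sketch only; package name and schema path are hypothetical.
    from thds.tabularasa.data_dependencies.build import ReferenceDataBuildCommand

    BuildCmd = ReferenceDataBuildCommand.with_options(
        package_name="my_package",
        schema_path="schema.yaml",
        for_setup_py_build=False,  # skip the setuptools command initialization
    )

    cmd = BuildCmd()                 # loads the schema for my_package
    cmd.write_derived_source_code()  # regenerate the derived source modules enabled in build_options
    cmd.build_package_data()         # write parquet package data and, if configured, the sqlite db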
+ def write_package_data_tables(
+     schema: Schema,
+     output_data_dir: str,
+     transient_data_dir: str,
+     package: str,
+     check_hash: bool = True,
+     table_predicate: Callable[[Table], bool] = is_build_time_package_table,
+     validate_transient_tables: bool = False,
+ ) -> Tuple[List[str], List[str]]:
+     """This is the main routine for building all derived package data.
+
+     The main steps in this process are:
+     1) compute the computational DAG represented by the dependencies in the schema
+     2) determine which of the reference table nodes in the DAG have already been computed and can be
+        loaded from disk, by using a file existence check and optionally a hash check (this saves time
+        downloading ADLS resources and computing derived data in local builds). Dependency links are
+        removed for local reference table nodes which have been precomputed, since these can be simply
+        loaded from their local package data files. Any nodes remaining in the DAG with no upstream
+        dependencies and no downstream dependents remaining to be computed are finally removed
+        from the DAG.
+     3) traverse the DAG in topological order and compute the resources associated with the nodes. For
+        ADLS resources, this means fetching the remote files from ADLS to a local build cache. For derived
+        package data tables, it means
+        a) importing and calling the associated preprocessor function on the upstream ADLS, local file,
+           and reference table dependencies, when the dependencies are a `RawDataDependencies` instance
+        b) loading the table from a tabular text file, when the dependencies are a `TabularFileSource`
+           instance
+        In that case, the resulting derived table is saved as a strictly-typed parquet file in
+        `output_data_dir` as package data for `package`.
+
+     For remote builds, there will be no cache populated and so the pruning in step 2 has no effect. For
+     local builds, the speed of development will benefit from the local cache being populated on the first
+     build.
+
+     The table_predicate argument can be used to filter only a subset of tables for computation and
+     packaging. By default, all tables marked as build-time-installed and not transient (and all of their
+     recursive dependencies) are computed.
+
+     :return: 2 lists of *package data* paths to the derived table parquet files, one for tables packaged
+         with interfaces and another for transient tables. These can be used e.g. to specify package data
+         paths in a build.
+     """
+     _LOGGER = getLogger(__name__)
+     _LOGGER.info("Computing derived reference data tables")
+     compute_order, precomputed_tables = _computation_order_and_dependencies(
+         schema,
+         package=package,
+         output_data_dir=output_data_dir,
+         transient_data_dir=transient_data_dir,
+         check_hash=check_hash,
+         table_predicate=table_predicate,
+     )
+
+     # optimization for DIY garbage collection as prior nodes are no longer needed for future nodes;
+     # we can remove a computed table from the cache as soon as we pass its last required index
+     last_indexes = {}
+     for ix, (ref, deps) in enumerate(compute_order):
+         last_indexes[ref] = ix
+         for dep in deps:
+             last_indexes[dep] = ix
+
+     # store intermediate results here
+     adls_cache: Dict[str, List[ADLSDownloadResult]] = {}
+     ref_cache: Dict[str, pd.DataFrame] = {}
+
+     if len(compute_order):
+         _LOGGER.info("Traversing data dependency DAG and computing tables")
+
+     # keep track of paths to tables that have already been computed
+     package_data_paths: List[str] = []
+     transient_data_paths: List[str] = []
+     for table_ref, path in precomputed_tables.items():
+         if isinstance(table_ref, TransientReferenceDataRef):
+             transient_data_paths.append(path)
+         else:
+             package_data_paths.append(path)
+
+     # finally loop over package data resources that need computing and fetch/compute them
+     for ix, (ref, deps) in enumerate(compute_order):
+         if isinstance(ref, NoOpRefTypes):
+             # shouldn't happen because we filter them in determining the compute order -
+             # just for completeness
+             continue
+         elif isinstance(ref, ADLSRef):
+             # download ADLS files
+             _LOGGER.info(f"Syncing ADLS resource {ref}")
+             adls_cache[str(ref)] = sync_adls_data(schema.remote_data[str(ref)])
+         elif isinstance(ref, ReferenceDataRef):
+             table = schema.tables[str(ref)]
+             if table.transient:
+                 data_dir = transient_data_dir
+                 paths = transient_data_paths
+             else:
+                 data_dir = output_data_dir
+                 paths = package_data_paths
+
+             if ref not in precomputed_tables:
+                 # compute package data table from dependencies
+                 df = _compute_dependent_table(table, ref_cache, adls_cache, schema.local_data)
+                 _LOGGER.info("Saving newly computed table %s", ref)
+                 _save_as_package_data(df, table, package, data_dir)
+             else:
+                 df = None
+
+             package_data_path = default_parquet_package_data_path(table.name, data_dir)
+             paths.append(package_data_path)
+
+             # garbage collection
+             for dep in deps:
+                 if isinstance(dep, ReferenceDataRef) and last_indexes[dep] <= ix:
+                     _LOGGER.info(
+                         "Collecting table %s which is not needed in any downstream build step",
+                         dep,
+                     )
+                     del ref_cache[str(dep)]
+
+             if last_indexes[ref] <= ix:
+                 if df is not None:
+                     _LOGGER.info(
+                         "Collecting table %s which is not needed in any downstream build step",
+                         ref,
+                     )
+                     del df
+             else:
+                 # load in from disk for downstream computations - loading from disk ensures exactly the
+                 # same dataframe whether the above block was run or not
+                 _LOGGER.info(
+                     "Loading table %s from disk for use in next %d build steps",
+                     table.name,
+                     last_indexes[ref] - ix,
+                 )
+                 validate = validate_transient_tables and table.transient
+                 pandas_loader = PandasParquetLoader.from_schema_table(
+                     table, package=package, data_dir=data_dir, derive_schema=validate
+                 )
+                 df = pandas_loader(validate=validate)
+                 ref_cache[str(ref)] = df
+
+     return package_data_paths, transient_data_paths
+
+
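The docstring of `write_package_data_tables` describes the core pattern: prune the dependency DAG down to what the requested tables actually need, walk it in topological order, and evict cached tables once nothing downstream uses them. A standalone toy sketch of that pattern with networkx (the node names are invented; the real pruning is done by `_computation_order_and_dependencies` below together with `thds.tabularasa.schema.util.predecessor_graph`):

    import networkx as nx

    # edges point from dependency -> dependent
    dag = nx.DiGraph()
    dag.add_edges_from([("adls://raw", "staging"), ("staging", "final"), ("adls://other", "unused")])

    requested = {"final"}
    # keep only the requested nodes and everything upstream of them
    keep = set(requested)
    for target in requested:
        keep |= nx.ancestors(dag, target)
    pruned = dag.subgraph(keep)

    # one build step per node, in dependency order, paired with its direct dependencies
    compute_order = [(ref, list(pruned.predecessors(ref))) for ref in nx.topological_sort(pruned)]

    # last index at which each node is still needed -> safe point to evict it from the in-memory cache
    last_indexes = {}
    for ix, (ref, deps) in enumerate(compute_order):
        last_indexes[ref] = ix
        for dep in deps:
            last_indexes[dep] = ix

    print(compute_order)  # [('adls://raw', []), ('staging', ['adls://raw']), ('final', ['staging'])]
    print(last_indexes)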
+ def _compute_dependent_table(
+     table: Table,
+     ref_cache: Mapping[str, pd.DataFrame],
+     adls_cache: Mapping[str, List[ADLSDownloadResult]],
+     local_cache: Mapping[str, LocalDataSpec],
+ ) -> pd.DataFrame:
+     _LOGGER = getLogger(__name__)
+     if isinstance(table.dependencies, TabularFileSource):
+         pandas_loader = PandasCSVLoader(table)
+         _LOGGER.info(
+             "Translating tabular text file at %s to parquet for table %s",
+             table.dependencies.filename,
+             table.name,
+         )
+         df = pandas_loader(validate=False)
+     elif isinstance(table.dependencies, RawDataDependencies):
+         ref_deps = table.dependencies.reference
+         adls_deps = table.dependencies.adls
+         local_deps = table.dependencies.local
+         _LOGGER.info(
+             "Computing table %s from reference dependencies [%s] local dependencies [%s] and ADLS "
+             "dependencies [%s]",
+             table.name,
+             ", ".join(ref_deps),
+             ", ".join(local_deps),
+             ", ".join(adls_deps),
+         )
+         preprocessor = import_data_preprocessor(table.dependencies.preprocessor)
+         df = preprocessor(
+             {dep: ref_cache[dep] for dep in ref_deps},
+             {dep: adls_cache[dep] for dep in adls_deps},
+             {dep: local_cache[dep] for dep in local_deps},
+         )
+     else:
+         raise ValueError(f"Can't compute table {table.name}: no dependencies defined")
+
+     return df
+
+
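`_compute_dependent_table` resolves the preprocessor named in the schema via `import_data_preprocessor` and calls it with three mappings in a fixed positional order. Judging only from that call site, a preprocessor looks roughly like the hypothetical sketch below (the function, table, and column names are invented):

    from typing import List, Mapping

    import pandas as pd

    from thds.tabularasa.data_dependencies.adls import ADLSDownloadResult
    from thds.tabularasa.schema.files import LocalDataSpec


    def build_my_table(
        reference: Mapping[str, pd.DataFrame],         # upstream reference tables, keyed by name
        adls: Mapping[str, List[ADLSDownloadResult]],  # downloaded ADLS files, keyed by resource ref
        local: Mapping[str, LocalDataSpec],            # local file specs, keyed by ref
    ) -> pd.DataFrame:
        base = reference["some_upstream_table"]        # hypothetical dependency name
        # ...combine base with the raw adls/local inputs here...
        return base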
+ def _save_as_package_data(
+     df: pd.DataFrame,
+     table: Table,
+     package_name: str,
+     data_dir: str,
+ ) -> Path:
+     """NOTE: This function mutates `df` but is only ever called in one place in
+     `write_package_data_tables`, just before the reference to `df` is collected."""
+     file_path = Path(
+         pkg_resources.resource_filename(
+             package_name, default_parquet_package_data_path(table.name, data_dir)
+         )
+     )
+     getLogger(__name__).info("Writing table %s to %s", table.name, file_path)
+     file_path.parent.mkdir(parents=True, exist_ok=True)
+
+     # cast complex types (e.g. dicts) to types that pyarrow can interpret for writing to parquet
+     # cast some other compatible dtypes or warn if it can't be done safely
+     # reset index and sort by index columns
+     # ensure exact parquet schema by using pyarrow
+     arrow = arrow_table_for_parquet_write(df, table)
+     pyarrow.parquet.write_table(arrow, file_path, compression="snappy", version=PARQUET_FORMAT_VERSION)
+     return file_path
+
+
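`arrow_table_for_parquet_write` is a tabularasa helper (defined in `.util`, not shown in this diff), but the key point noted in the comments above, writing parquet against an explicit pyarrow schema rather than whatever dtypes pandas inferred, can be illustrated with plain pandas/pyarrow and made-up data:

    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    # made-up example table
    df = pd.DataFrame({"fips": ["01001", "01003"], "population": [58805, 231767]})

    # declare the exact column types the parquet file should carry
    explicit_schema = pa.schema([("fips", pa.string()), ("population", pa.int32())])

    # from_pandas casts to the declared schema (here int64 -> int32) instead of keeping inferred dtypes
    arrow = pa.Table.from_pandas(df, schema=explicit_schema, preserve_index=False)
    pq.write_table(arrow, "population.parquet", compression="snappy")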
+ def _computation_order_and_dependencies(
+     schema: Schema,
+     package: str,
+     output_data_dir: str,
+     transient_data_dir: str,
+     table_predicate: Callable[[Table], bool],
+     check_hash: bool = True,
+ ) -> Tuple[List[Tuple[ResourceRef, List[ResourceRef]]], Dict[ReferenceDataRef, str]]:
+     _LOGGER = getLogger(__name__)
+     # guaranteed to be a DAG by load-time validation
+     dag = schema.dependency_dag(table_predicate)
+
+     precomputed_tables: Dict[ReferenceDataRef, str] = dict()
+
+     if check_hash:
+         _LOGGER.info("Checking hashes of existing derived tables")
+
+     # determine dependent tables that have already been computed by hash
+     for table in schema.filter_tables(lambda t: t.graph_ref in dag):
+         derived_pqt_md5 = table.md5
+         pqt_package_data_path = default_parquet_package_data_path(
+             table.name,
+             data_dir=transient_data_dir if table.transient else output_data_dir,
+         )
+
+         if check_hash:
+             if derived_pqt_md5 is not None and pkg_resources.resource_exists(
+                 package,
+                 pqt_package_data_path,
+             ):
+                 with pkg_resources.resource_stream(package, pqt_package_data_path) as f:
+                     if hash_file(f) == derived_pqt_md5:
+                         precomputed_tables[table.graph_ref] = pqt_package_data_path
+                     else:
+                         _LOGGER.warning(
+                             "MD5 of file %s in package %s for table %s doesn't match expected value; "
+                             "cannot safely skip computation",
+                             pqt_package_data_path,
+                             package,
+                             table.name,
+                         )
+             elif derived_pqt_md5 is None:
+                 _LOGGER.warning(
+                     "No MD5 hash defined for table %s; it will be re-computed on every build; add a hash"
+                     " of the generated file %s in the %s package to the dependencies block of the "
+                     "schema to prevent this",
+                     table.name,
+                     pqt_package_data_path,
+                     package,
+                 )
+         elif pkg_resources.resource_exists(package, pqt_package_data_path):
+             if derived_pqt_md5 is not None:
+                 _LOGGER.warning(
+                     "Ignoring MD5 hash for table %s since check_hash=False was passed; its associated "
+                     "package data exists at %s and will not be regenerated regardless of its hash",
+                     table.name,
+                     pqt_package_data_path,
+                 )
+             precomputed_tables[table.graph_ref] = pqt_package_data_path
+
+     # we don't need to compute dependencies for tables that have been computed - they can be loaded
+     # from disk
+     for table_ref in precomputed_tables:
+         _LOGGER.info(
+             f"{table_ref!r} is pre-computed and can be loaded from package data; removing dependency "
+             f"links"
+         )
+         # don't need to compute dependencies for this table; can load from disk
+         for upstream in list(dag.predecessors(table_ref)):
+             dag.remove_edge(upstream, table_ref)
+
+     # anything not required by our intended tables can be removed
+     requested_tables = set(
+         table.graph_ref for table in schema.filter_tables(table_predicate)
+     ).difference(precomputed_tables)
+     filtered_dag = predecessor_graph(dag, requested_tables)
+     for ref in set(dag).difference(filtered_dag):
+         _LOGGER.info(
+             f"Safely skipping computation of {ref!r}; no downstream dependencies remaining to compute"
+         )
+         dag.remove_node(ref)
+
+     def is_build_step(ref):
+         if isinstance(ref, NoOpRefTypes):
+             return False
+         if isinstance(ref, ReferenceDataRef):
+             return ref not in precomputed_tables or any(filtered_dag.successors(ref))
+         return True
+
+     load_order = [
+         (ref, list(filtered_dag.predecessors(ref)))
+         for ref in filter(is_build_step, nx.topological_sort(filtered_dag))
+     ]
+     _LOGGER.info(f"Final build stage order: {[ref for ref, deps in load_order]}")
+     return load_order, precomputed_tables
+
+
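The hash check above compares `hash_file(f)` against the `md5` recorded for the table in the schema and skips recomputation on a match. `hash_file` lives in `thds.tabularasa.data_dependencies.util` and is not shown in this diff, so the following is only a generic chunked-MD5 sketch of the same kind of check, with a hypothetical file path and hash value:

    import hashlib
    from typing import BinaryIO


    def md5_of_stream(f: BinaryIO, chunk_size: int = 1 << 20) -> str:
        """Hash a binary stream in chunks so large parquet files aren't read into memory at once."""
        digest = hashlib.md5()
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
        return digest.hexdigest()


    expected_md5 = "0123456789abcdef0123456789abcdef"  # would come from the schema's table.md5
    with open("some_table.parquet", "rb") as f:        # hypothetical package data file
        if md5_of_stream(f) == expected_md5:
            print("hash matches; computation can be skipped")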
+ def get_data_files_to_package(
+     package: str,
+     package_data_paths: Iterable[str],
+     size_limit: Optional[int],
+ ) -> Iterable[str]:
+     _LOGGER = getLogger(__name__)
+     if size_limit is None:
+         _LOGGER.info("Packaging all data files since no size limit is specified")
+         yield from package_data_paths
+     else:
+         size_limit_ = cast(int, size_limit)  # mypy needs this for some weird reason
+
+         def size_filter(package_data_path: str, size_limit: int = size_limit_) -> bool:
+             if package_data_file_size(package, package_data_path) > size_limit:
+                 _LOGGER.info(
+                     f"Filtering out {package_data_path}. File is too large to package "
+                     "but will be stored in a remote blob store"
+                 )
+                 return False
+             return True
+
+         yield from filter(size_filter, package_data_paths)