dsgrid-toolkit 0.3.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. build_backend.py +93 -0
  2. dsgrid/__init__.py +22 -0
  3. dsgrid/api/__init__.py +0 -0
  4. dsgrid/api/api_manager.py +179 -0
  5. dsgrid/api/app.py +419 -0
  6. dsgrid/api/models.py +60 -0
  7. dsgrid/api/response_models.py +116 -0
  8. dsgrid/apps/__init__.py +0 -0
  9. dsgrid/apps/project_viewer/app.py +216 -0
  10. dsgrid/apps/registration_gui.py +444 -0
  11. dsgrid/chronify.py +32 -0
  12. dsgrid/cli/__init__.py +0 -0
  13. dsgrid/cli/common.py +120 -0
  14. dsgrid/cli/config.py +176 -0
  15. dsgrid/cli/download.py +13 -0
  16. dsgrid/cli/dsgrid.py +157 -0
  17. dsgrid/cli/dsgrid_admin.py +92 -0
  18. dsgrid/cli/install_notebooks.py +62 -0
  19. dsgrid/cli/query.py +729 -0
  20. dsgrid/cli/registry.py +1862 -0
  21. dsgrid/cloud/__init__.py +0 -0
  22. dsgrid/cloud/cloud_storage_interface.py +140 -0
  23. dsgrid/cloud/factory.py +31 -0
  24. dsgrid/cloud/fake_storage_interface.py +37 -0
  25. dsgrid/cloud/s3_storage_interface.py +156 -0
  26. dsgrid/common.py +36 -0
  27. dsgrid/config/__init__.py +0 -0
  28. dsgrid/config/annual_time_dimension_config.py +194 -0
  29. dsgrid/config/common.py +142 -0
  30. dsgrid/config/config_base.py +148 -0
  31. dsgrid/config/dataset_config.py +907 -0
  32. dsgrid/config/dataset_schema_handler_factory.py +46 -0
  33. dsgrid/config/date_time_dimension_config.py +136 -0
  34. dsgrid/config/dimension_config.py +54 -0
  35. dsgrid/config/dimension_config_factory.py +65 -0
  36. dsgrid/config/dimension_mapping_base.py +350 -0
  37. dsgrid/config/dimension_mappings_config.py +48 -0
  38. dsgrid/config/dimensions.py +1025 -0
  39. dsgrid/config/dimensions_config.py +71 -0
  40. dsgrid/config/file_schema.py +190 -0
  41. dsgrid/config/index_time_dimension_config.py +80 -0
  42. dsgrid/config/input_dataset_requirements.py +31 -0
  43. dsgrid/config/mapping_tables.py +209 -0
  44. dsgrid/config/noop_time_dimension_config.py +42 -0
  45. dsgrid/config/project_config.py +1462 -0
  46. dsgrid/config/registration_models.py +188 -0
  47. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  48. dsgrid/config/simple_models.py +49 -0
  49. dsgrid/config/supplemental_dimension.py +29 -0
  50. dsgrid/config/time_dimension_base_config.py +192 -0
  51. dsgrid/data_models.py +155 -0
  52. dsgrid/dataset/__init__.py +0 -0
  53. dsgrid/dataset/dataset.py +123 -0
  54. dsgrid/dataset/dataset_expression_handler.py +86 -0
  55. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  56. dsgrid/dataset/dataset_schema_handler_base.py +945 -0
  57. dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
  58. dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
  59. dsgrid/dataset/growth_rates.py +162 -0
  60. dsgrid/dataset/models.py +51 -0
  61. dsgrid/dataset/table_format_handler_base.py +257 -0
  62. dsgrid/dataset/table_format_handler_factory.py +17 -0
  63. dsgrid/dataset/unpivoted_table.py +121 -0
  64. dsgrid/dimension/__init__.py +0 -0
  65. dsgrid/dimension/base_models.py +230 -0
  66. dsgrid/dimension/dimension_filters.py +308 -0
  67. dsgrid/dimension/standard.py +252 -0
  68. dsgrid/dimension/time.py +352 -0
  69. dsgrid/dimension/time_utils.py +103 -0
  70. dsgrid/dsgrid_rc.py +88 -0
  71. dsgrid/exceptions.py +105 -0
  72. dsgrid/filesystem/__init__.py +0 -0
  73. dsgrid/filesystem/cloud_filesystem.py +32 -0
  74. dsgrid/filesystem/factory.py +32 -0
  75. dsgrid/filesystem/filesystem_interface.py +136 -0
  76. dsgrid/filesystem/local_filesystem.py +74 -0
  77. dsgrid/filesystem/s3_filesystem.py +118 -0
  78. dsgrid/loggers.py +132 -0
  79. dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
  80. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
  81. dsgrid/notebooks/registration.ipynb +48 -0
  82. dsgrid/notebooks/start_notebook.sh +11 -0
  83. dsgrid/project.py +451 -0
  84. dsgrid/query/__init__.py +0 -0
  85. dsgrid/query/dataset_mapping_plan.py +142 -0
  86. dsgrid/query/derived_dataset.py +388 -0
  87. dsgrid/query/models.py +728 -0
  88. dsgrid/query/query_context.py +287 -0
  89. dsgrid/query/query_submitter.py +994 -0
  90. dsgrid/query/report_factory.py +19 -0
  91. dsgrid/query/report_peak_load.py +70 -0
  92. dsgrid/query/reports_base.py +20 -0
  93. dsgrid/registry/__init__.py +0 -0
  94. dsgrid/registry/bulk_register.py +165 -0
  95. dsgrid/registry/common.py +287 -0
  96. dsgrid/registry/config_update_checker_base.py +63 -0
  97. dsgrid/registry/data_store_factory.py +34 -0
  98. dsgrid/registry/data_store_interface.py +74 -0
  99. dsgrid/registry/dataset_config_generator.py +158 -0
  100. dsgrid/registry/dataset_registry_manager.py +950 -0
  101. dsgrid/registry/dataset_update_checker.py +16 -0
  102. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  103. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  104. dsgrid/registry/dimension_registry_manager.py +413 -0
  105. dsgrid/registry/dimension_update_checker.py +16 -0
  106. dsgrid/registry/duckdb_data_store.py +207 -0
  107. dsgrid/registry/filesystem_data_store.py +150 -0
  108. dsgrid/registry/filter_registry_manager.py +123 -0
  109. dsgrid/registry/project_config_generator.py +57 -0
  110. dsgrid/registry/project_registry_manager.py +1623 -0
  111. dsgrid/registry/project_update_checker.py +48 -0
  112. dsgrid/registry/registration_context.py +223 -0
  113. dsgrid/registry/registry_auto_updater.py +316 -0
  114. dsgrid/registry/registry_database.py +667 -0
  115. dsgrid/registry/registry_interface.py +446 -0
  116. dsgrid/registry/registry_manager.py +558 -0
  117. dsgrid/registry/registry_manager_base.py +367 -0
  118. dsgrid/registry/versioning.py +92 -0
  119. dsgrid/rust_ext/__init__.py +14 -0
  120. dsgrid/rust_ext/find_minimal_patterns.py +129 -0
  121. dsgrid/spark/__init__.py +0 -0
  122. dsgrid/spark/functions.py +589 -0
  123. dsgrid/spark/types.py +110 -0
  124. dsgrid/tests/__init__.py +0 -0
  125. dsgrid/tests/common.py +140 -0
  126. dsgrid/tests/make_us_data_registry.py +265 -0
  127. dsgrid/tests/register_derived_datasets.py +103 -0
  128. dsgrid/tests/utils.py +25 -0
  129. dsgrid/time/__init__.py +0 -0
  130. dsgrid/time/time_conversions.py +80 -0
  131. dsgrid/time/types.py +67 -0
  132. dsgrid/units/__init__.py +0 -0
  133. dsgrid/units/constants.py +113 -0
  134. dsgrid/units/convert.py +71 -0
  135. dsgrid/units/energy.py +145 -0
  136. dsgrid/units/power.py +87 -0
  137. dsgrid/utils/__init__.py +0 -0
  138. dsgrid/utils/dataset.py +830 -0
  139. dsgrid/utils/files.py +179 -0
  140. dsgrid/utils/filters.py +125 -0
  141. dsgrid/utils/id_remappings.py +100 -0
  142. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  143. dsgrid/utils/py_expression_eval/README.md +8 -0
  144. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  145. dsgrid/utils/py_expression_eval/tests.py +283 -0
  146. dsgrid/utils/run_command.py +70 -0
  147. dsgrid/utils/scratch_dir_context.py +65 -0
  148. dsgrid/utils/spark.py +918 -0
  149. dsgrid/utils/spark_partition.py +98 -0
  150. dsgrid/utils/timing.py +239 -0
  151. dsgrid/utils/utilities.py +221 -0
  152. dsgrid/utils/versioning.py +36 -0
  153. dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
  154. dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
  155. dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
  156. dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
  157. dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
@@ -0,0 +1,367 @@
1
+ """Base class for all registry managers."""
2
+
3
+ import abc
4
+ import copy
5
+ import logging
6
+ from pathlib import Path
7
+ from typing import Any, Self, Type
8
+
9
+ from semver import VersionInfo
10
+ from sqlalchemy import Connection
11
+
12
+ from dsgrid.config.config_base import ConfigBase
13
+ from dsgrid.exceptions import (
14
+ DSGInvalidParameter,
15
+ DSGValueNotRegistered,
16
+ DSGDuplicateValueRegistered,
17
+ )
18
+ from dsgrid.registry.registration_context import RegistrationContext
19
+ from dsgrid.registry.registry_interface import RegistryInterfaceBase
20
+ from dsgrid.registry.common import RegistryManagerParams, VersionUpdateType
21
+
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ class RegistryManagerBase(abc.ABC):
27
+ """Base class for all registry managers."""
28
+
29
+ def __init__(self, path, params: RegistryManagerParams):
30
+ self._path = path
31
+ self._params = params
32
+ self._db = None
33
+
34
+ if not path.exists():
35
+ logger.warning(
36
+ "The registry data path=%s does not exist. You will able to inspect the registry "
37
+ "contents, but you will not be able to perform any data-related activities.",
38
+ path,
39
+ )
40
+
41
+ @property
42
+ @abc.abstractmethod
43
+ def db(self) -> RegistryInterfaceBase:
44
+ """Return the database interface."""
45
+
46
+ @db.setter
47
+ @abc.abstractmethod
48
+ def db(self, db: RegistryInterfaceBase) -> None:
49
+ """Return the database interface."""
50
+
51
+ @classmethod
52
+ def load(cls, path, params, db, *args: Any, **kwargs: Any) -> Self:
53
+ """Load the registry manager.
54
+
55
+ path : str
56
+ params : RegistryManagerParams
57
+
58
+ Returns
59
+ -------
60
+ RegistryManagerBase
61
+
62
+ """
63
+ mgr = cls(path, params)
64
+ mgr.db = db
65
+ return mgr
66
+
67
+ @classmethod
68
+ def _load(cls, path, params: RegistryManagerParams, *args):
69
+ mgr = cls(path, params, *args)
70
+ return mgr
71
+
72
+ @staticmethod
73
+ @abc.abstractmethod
74
+ def config_class() -> Type:
75
+ """Return the class used for storing the config."""
76
+
77
+ @abc.abstractmethod
78
+ def get_by_id(
79
+ self, config_id: str, version: str | None = None, conn: Connection | None = None
80
+ ) -> ConfigBase:
81
+ """Get the item matching matching ID. Returns from cache if already loaded.
82
+
83
+ Parameters
84
+ ----------
85
+ config_id : str
86
+ version : str
87
+ If None, return the latest version.
88
+
89
+ Returns
90
+ -------
91
+ DSGBaseModel
92
+
93
+ Raises
94
+ ------
95
+ DSGValueNotRegistered
96
+ Raised if the ID is not stored.
97
+
98
+ """
99
+
100
+ @staticmethod
101
+ @abc.abstractmethod
102
+ def name() -> str:
103
+ """Return the name of the registry, used for reporting.
104
+
105
+ Returns
106
+ -------
107
+ str
108
+
109
+ """
110
+
111
+ @abc.abstractmethod
112
+ def register(self, *args: Any, **kwargs: Any) -> Any:
113
+ """Registers a config file in the registry.
114
+
115
+ Raises
116
+ ------
117
+ ValueError
118
+ Raised if the config_file is invalid.
119
+ DSGDuplicateValueRegistered
120
+ Raised if the config ID is already registered.
121
+
122
+ """
123
+
124
+ @abc.abstractmethod
125
+ def register_from_config(self, config: ConfigBase, *args: Any, **kwargs) -> Any:
126
+ """Registers a config file in the registry.
127
+
128
+ Raises
129
+ ------
130
+ ValueError
131
+ Raised if the config_file is invalid.
132
+ DSGDuplicateValueRegistered
133
+ Raised if the config ID is already registered.
134
+
135
+ """
136
+
137
+ @abc.abstractmethod
138
+ def update_from_file(self, *args: Any, **kwargs: Any) -> ConfigBase:
139
+ """Updates the current registry with new parameters or data from a config file.
140
+
141
+ Raises
142
+ ------
143
+ ValueError
144
+ Raised if the config_file is invalid.
145
+ DSGInvalidParameter
146
+ Raised if config_id does not match config_file.
147
+ Raised if the version is not the current version.
148
+
149
+ """
150
+
151
+ @abc.abstractmethod
152
+ def update(
153
+ self,
154
+ config: ConfigBase,
155
+ *args: Any,
156
+ **kwargs: Any,
157
+ ) -> ConfigBase:
158
+ """Updates the current registry with new parameters or data.
159
+
160
+ Raises
161
+ ------
162
+ ValueError
163
+ Raised if the config_file is invalid.
164
+ DSGInvalidParameter
165
+ Raised if config_id does not match config_file.
166
+ Raised if the version is not the current version.
167
+
168
+ """
169
+
170
+ def _check_update(
171
+ self, conn: Connection, config: ConfigBase, config_id: str, version: str
172
+ ) -> None:
173
+ if config.config_id != config_id:
174
+ msg = f"ID={config_id} does not match ID in file: {config.config_id}"
175
+ raise DSGInvalidParameter(msg)
176
+
177
+ cur_version = self.get_latest_version(config_id, conn=conn)
178
+ if version != cur_version:
179
+ msg = f"version={version} is not current. Current={cur_version}"
180
+ raise DSGInvalidParameter(msg)
181
+
182
+ @staticmethod
183
+ def get_next_version(version: str, update_type: VersionUpdateType):
184
+ ver = VersionInfo.parse(version)
185
+ if update_type == VersionUpdateType.MAJOR:
186
+ next_version = ver.bump_major()
187
+ elif update_type == VersionUpdateType.MINOR:
188
+ next_version = ver.bump_minor()
189
+ elif update_type == VersionUpdateType.PATCH:
190
+ next_version = ver.bump_patch()
191
+ else:
192
+ msg = f"invalid version {update_type=}"
193
+ raise NotImplementedError(msg)
194
+
195
+ return str(next_version)
196
+
197
+ def _update_config(self, config, context: RegistrationContext):
198
+ config_id = config.config_id
199
+ cur_version = config.model.version
200
+ new_model = copy.deepcopy(config.model)
201
+ new_model.version = self.get_next_version(cur_version, context.registration.update_type)
202
+ updated_model = self.db.update(context.connection, new_model, context.registration)
203
+ logger.info(
204
+ "Updated registry and config information for %s ID=%s version=%s",
205
+ self.name(),
206
+ config_id,
207
+ updated_model.version,
208
+ )
209
+ return updated_model
210
+
211
+ def _check_if_already_registered(self, conn: Connection, config_id):
212
+ if self.db.has(conn, config_id):
213
+ msg = f"{self.name()}={config_id}"
214
+ raise DSGDuplicateValueRegistered(msg)
215
+
216
+ def _check_if_not_registered(self, conn: Connection, config_id):
217
+ if not self.db.has(conn, config_id):
218
+ msg = f"{self.name()}={config_id}"
219
+ raise DSGValueNotRegistered(msg)
220
+
221
+ def _log_offline_mode_prefix(self):
222
+ return "* OFFLINE MODE * |" if self.offline_mode else ""
223
+
224
+ @property
225
+ def cloud_interface(self):
226
+ """Return the CloudStorageInterface to sync remote data."""
227
+ return self._params.cloud_interface
228
+
229
+ @cloud_interface.setter
230
+ def cloud_interface(self, cloud_interface):
231
+ """Set the CloudStorageInterface (used in testing)"""
232
+ self._params = self._params._replace(cloud_interface=cloud_interface)
233
+
234
+ def dump(
235
+ self,
236
+ config_id,
237
+ directory,
238
+ version=None,
239
+ conn: Connection | None = None,
240
+ force: bool = False,
241
+ ):
242
+ """Dump the config file to directory.
243
+
244
+ Parameters
245
+ ----------
246
+ config_id : str
247
+ directory : str
248
+ version : VersionInfo | None
249
+ Defaults to current version.
250
+ force : bool
251
+ If True, overwrite files if they exist.
252
+
253
+ """
254
+ path = Path(directory)
255
+ path.mkdir(exist_ok=True, parents=True)
256
+ config = self.get_by_id(config_id, version, conn=conn)
257
+ filename = config.serialize(path, force=force)
258
+ logger.info(
259
+ "Dumped config for type=%s ID=%s version=%s to %s",
260
+ self.name(),
261
+ config_id,
262
+ config.model.version,
263
+ filename,
264
+ )
265
+
266
+ def finalize_registration(self, conn: Connection, config_ids: set[str], error_occurred: bool):
267
+ """Peform final actions after a registration process.
268
+
269
+ Parameters
270
+ ----------
271
+ config_ids : set[str]
272
+ Config IDs that were registered
273
+ error_occurred : bool
274
+ Set to True if an error occurred and all intermediately-registered IDs should be
275
+ removed.
276
+ """
277
+
278
+ @property
279
+ def fs_interface(self):
280
+ """Return the FilesystemInterface to list directories and read/write files."""
281
+ return self._params.fs_interface
282
+
283
+ @property
284
+ def offline_mode(self):
285
+ """Return True if there is to be no syncing with the remote registry."""
286
+ return self._params.offline
287
+
288
+ def get_latest_version(self, config_id, conn: Connection | None = None):
289
+ """Return the current version in the registry.
290
+
291
+ Returns
292
+ -------
293
+ str
294
+
295
+ """
296
+ return self.db.get_latest_version(conn, config_id)
297
+
298
+ def get_registry_data_directory(self, config_id):
299
+ """Return the directory containing data for config_id (parquet files).
300
+
301
+ Parameters
302
+ ----------
303
+ config_id : str
304
+
305
+ Returns
306
+ -------
307
+ str
308
+
309
+ """
310
+ return Path(self._params.base_path) / "data" / config_id
311
+
312
+ def has_id(self, config_id, version=None, conn: Connection | None = None):
313
+ """Return True if an item matching the parameters is stored.
314
+
315
+ Parameters
316
+ ----------
317
+ config_id : str
318
+ version : str
319
+ If None, use latest.
320
+
321
+ Returns
322
+ -------
323
+ bool
324
+
325
+ """
326
+ return self.db.has(conn, config_id, version=version)
327
+
328
+ def iter_configs(self, conn: Connection | None = None):
329
+ """Return an iterator over the registered configs."""
330
+ for config_id in self.iter_ids(conn):
331
+ yield self.get_by_id(config_id, conn=conn)
332
+
333
+ def iter_ids(self, conn: Connection | None = None):
334
+ """Return an iterator over the registered dsgrid IDs."""
335
+ yield from self.db.list_model_ids(conn)
336
+
337
+ def list_ids(self, conn: Connection | None = None, **kwargs: Any):
338
+ """Return the IDs.
339
+
340
+ Returns
341
+ -------
342
+ list
343
+
344
+ """
345
+ return sorted(self.iter_ids(conn))
346
+
347
+ def relative_remote_path(self, path):
348
+ """Return relative remote registry path."""
349
+ relative_path = Path(path).relative_to(self._params.base_path)
350
+ remote_path = f"{self._params.remote_path}/{relative_path}"
351
+ return remote_path
352
+
353
+ @abc.abstractmethod
354
+ def remove(self, config_id: str, conn: Connection | None = None) -> None:
355
+ """Remove an item from the registry.
356
+
357
+ Parameters
358
+ ----------
359
+ config_id : str
360
+
361
+ Raises
362
+ ------
363
+ DSGValueNotRegistered
364
+ Raised if the project_id is not registered.
365
+
366
+ """
367
+ # TODO: Do we want to handle specific versions? This removes all configs.
@@ -0,0 +1,92 @@
1
+ import os
2
+ import json5
3
+
4
DATASET_REGISTRY_PATH = "registry/datasets/"
PROJECT_REGISTRY_PATH = "registry/projects/"
DIMENSION_REGISTRY_PATH = "registry/dimensions/"


def versioning(registry_type, id_handle, update):
    """Determine registration or project version for registration.

    TODO: Current solution is a quick hack. This needs to be better/formalized.
        - Need smarter version updating / checks; use semver packages
        - Set to work with some central version (like S3)
        - Currently only updating major version
        - NOTE: not currently utilizing the update_type in
          ConfigRegistrationDetails. Could use this to set
          major/minor/patch update decisions

    Args:
        registry_type (RegistryType): type of registry; must compare equal to
            "dataset", "project", or "dimension"
        id_handle (str): ID handle is either the project_id or dataset_id
        update (bool): config registration update setting

    Returns:
        str: version handle of the form ``<id_handle>-v<major>.<minor>.<patch>``

    Raises:
        ValueError: if registry_type is not supported, the registry path does not
            exist, a v1.0.0 registry already exists when update is False, or no
            registry exists for id_handle when update is True.
        FileNotFoundError: if update is True and the latest major version has no
            ``<id_handle>-v<major>.0.0.json5`` file to mark as deprecated.
    """
    # Equality comparisons (rather than a dict lookup) so that enum values that
    # compare equal to these strings keep working regardless of their hash.
    if registry_type == "dataset":
        registry_path = DATASET_REGISTRY_PATH
    elif registry_type == "project":
        registry_path = PROJECT_REGISTRY_PATH
    elif registry_type == "dimension":
        registry_path = DIMENSION_REGISTRY_PATH
    else:
        msg = f"Unsupported registry_type: {registry_type!r}"
        raise ValueError(msg)

    # TODO: remove when done. project path should be set somewhere else
    if not os.path.exists(registry_path):
        msg = f"Path does not exist: {registry_path}"
        raise ValueError(msg)

    # if config.update is False, then assume major=1, minor=0, patch=0
    if not update:
        version = f"{id_handle}-v1.0.0"
        registry_file = os.path.join(registry_path, f"{version}.json5")
        # Raise error if v1.0.0 registry exists for project_id
        if os.path.exists(registry_file):
            msg = (
                f'{registry_type} registry for "{registry_file}" already '
                f"exists. If you want to update the project registration"
                f" with a new {registry_type} version, then you will need to"
                f" set update=True in {registry_type} config. Alternatively, "
                f"if you want to initiate a new dsgrid {registry_type}, you "
                "will need to specify a new version handle in the "
                f"{registry_type} config."
            )
            raise ValueError(msg)
        return version

    # update is True: collect the major versions already registered for this ID.
    # Match on "<id>-v" so that IDs sharing a prefix (or containing "-v") are
    # neither picked up by mistake nor mis-parsed.
    prefix = f"{id_handle}-v"
    existing_versions = []
    for filename in os.listdir(registry_path):
        if not filename.startswith(prefix):
            continue
        major_text = filename[len(prefix):].split(".")[0]
        if major_text.isdigit():
            existing_versions.append(int(major_text))

    if not existing_versions:
        msg = (
            "Registration.update=True, however, no updates can be made "
            f"because there are no existing registries for {registry_type}"
            f" ID = {id_handle}. Check project_id or set "
            f"Registration.update=False in the {registry_type} Config."
        )
        raise ValueError(msg)

    # find the latest registry version
    # NOTE: this is currently based on major version only
    last_vmajor_nbr = max(existing_versions)
    old_project_version = f"{id_handle}-v{last_vmajor_nbr}.0.0"
    old_registry_file = os.path.join(registry_path, f"{old_project_version}.json5")

    # deprecate old project registry; json5.load needs a file object, not a path
    with open(old_registry_file, encoding="utf-8") as f_in:
        registry = json5.load(f_in)
    registry["status"] = "Deprecated"
    with open(old_registry_file, "w", encoding="utf-8") as f_out:
        json5.dump(registry, f_out)

    # update version
    # TODO NEED REAL LOGIC FOR THIS!
    #   - Currently assuming only major version is being updated
    major = last_vmajor_nbr + 1
    minor = 0  # TODO: assume 0 for now
    patch = 0  # TODO: assume 0 for now

    return f"{id_handle}-v{major}.{minor}.{patch}"
@@ -0,0 +1,14 @@
1
"""Python wrapper for Rust-based pattern finding functionality."""

from dsgrid.rust_ext.find_minimal_patterns import find_minimal_patterns_from_file

try:
    from dsgrid.minimal_patterns import Pattern, PatternConfig
except ImportError as e:
    msg = (
        "Failed to import minimal_patterns Rust extension. "
        "Make sure the package was built with maturin: `pip install -e .` or `maturin develop`"
    )
    # Pass the message along; a bare ImportError() would hide the build hint.
    raise ImportError(msg) from e

__all__ = ["Pattern", "PatternConfig", "find_minimal_patterns_from_file"]
@@ -0,0 +1,129 @@
1
+ """Python wrapper for Rust-based pattern finding functionality."""
2
+
3
+ import csv
4
+ import logging
5
+ import shutil
6
+ from collections import defaultdict
7
+ from pathlib import Path
8
+
9
+ try:
10
+ from dsgrid.minimal_patterns import Pattern, PatternConfig, find_minimal_patterns
11
+ except ImportError as e:
12
+ msg = (
13
+ "Failed to import minimal_patterns Rust extension. "
14
+ "Make sure the package was built with maturin: `pip install -e .` or `maturin develop`"
15
+ )
16
+ raise ImportError() from e
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
def find_minimal_patterns_from_file(
    file_path: str | Path,
    max_depth: int = 0,
    prune_miss_empty: bool = True,
    ratio_threshold: float = 50.0,
    threads: int = 0,
    verbose: bool = False,
    output_dir: str | Path | None = "missing_associations",
) -> list[Pattern]:
    """Find minimal closed patterns in a Parquet file containing categorical data.

    This function analyzes a Parquet file to discover minimal closed patterns -
    the simplest column combinations that characterize complete subsets of your data.
    Patterns are grouped by their column combinations and written to CSV files.

    Parameters
    ----------
    file_path : str | Path
        Path to the input Parquet file
    max_depth : int, optional
        Maximum pattern size (number of columns). 0 = unlimited. Default: 0.
    prune_miss_empty : bool, optional
        Prune patterns with no matching rows (recommended: True). Default: True.
    ratio_threshold : float, optional
        Ratio threshold for pruning. Default: 50.0.
    threads : int, optional
        Number of threads to use (0 = use all available cores). Default: 0.
    verbose : bool, optional
        Enable verbose progress output. Default: False.
    output_dir : str | Path | None, optional
        Directory to write CSV files grouping patterns by column combinations.
        Each unique combination of columns produces a separate CSV file named
        ``<col1>__<col2>__...__<colN>.csv``. If None, no files are written.
        Note that an existing directory at this path is deleted and recreated.
        Default: "missing_associations".

    Returns
    -------
    list[Pattern]
        List of Pattern objects, each containing:

        - pattern_id : Unique identifier
        - columns : List of column names in the pattern
        - values : List of values for each column
        - num_rows : Number of rows matching this pattern

    Raises
    ------
    RuntimeError
        If there's an error reading the Parquet file or finding patterns.

    Examples
    --------
    >>> from dsgrid.rust_ext import find_minimal_patterns_from_file
    >>> patterns = find_minimal_patterns_from_file(
    ...     "missing_records.parquet", max_depth=3, verbose=True
    ... )
    >>> for p in patterns:
    ...     print(f"Pattern {p.pattern_id}: {p.columns} = {p.values} ({p.num_rows} rows)")
    """
    config = PatternConfig(
        max_depth=max_depth,
        prune_miss_empty=prune_miss_empty,
        ratio_threshold=ratio_threshold,
        threads=threads,
        verbose=verbose,
    )

    # The extension is called with a plain string path.
    parquet_path_str = str(file_path)
    logger.info("Finding minimal closed patterns in %s", parquet_path_str)

    patterns = find_minimal_patterns(parquet_path_str, config)

    logger.info("Found %d minimal closed patterns", len(patterns))

    if output_dir is not None:
        _write_patterns_to_csv(patterns, output_dir)

    return patterns
97
+
98
+
99
def _write_patterns_to_csv(patterns: list[Pattern], output_dir: str | Path) -> None:
    """Write patterns to CSV files grouped by column combinations.

    Any existing directory at ``output_dir`` is removed first, so the directory
    only ever contains the files from the most recent call. One file is written
    per distinct column combination, named ``<col1>__<col2>__...__<colN>.csv``,
    with the column names as the header row and one row of values per pattern.

    Parameters
    ----------
    patterns : list[Pattern]
        List of Pattern objects to write.
    output_dir : str | Path
        Directory to write CSV files to.
    """
    output_path = Path(output_dir)
    if output_path.exists():
        shutil.rmtree(output_path)
    output_path.mkdir(parents=True)

    grouped: dict[tuple[str, ...], list[Pattern]] = defaultdict(list)
    for pattern in patterns:
        grouped[tuple(pattern.columns)].append(pattern)

    for columns, group_patterns in grouped.items():
        filepath = output_path / ("__".join(columns) + ".csv")

        # Explicit UTF-8 so values outside the platform's locale code page (e.g.
        # cp1252 on Windows) do not raise UnicodeEncodeError; newline="" is what
        # the csv module requires to control line endings itself.
        with open(filepath, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(list(columns))
            writer.writerows(pattern.values for pattern in group_patterns)

        logger.info("Wrote %d patterns to %s", len(group_patterns), filepath)
File without changes