deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b11__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (175)
  1. deltacat/__init__.py +41 -16
  2. deltacat/api.py +478 -123
  3. deltacat/aws/s3u.py +2 -2
  4. deltacat/benchmarking/benchmark_engine.py +4 -2
  5. deltacat/benchmarking/conftest.py +1 -1
  6. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  7. deltacat/catalog/__init__.py +62 -5
  8. deltacat/catalog/main/impl.py +26 -10
  9. deltacat/catalog/model/catalog.py +165 -109
  10. deltacat/catalog/model/properties.py +25 -24
  11. deltacat/compute/__init__.py +14 -0
  12. deltacat/compute/converter/constants.py +5 -0
  13. deltacat/compute/converter/converter_session.py +78 -36
  14. deltacat/compute/converter/model/convert_input.py +24 -4
  15. deltacat/compute/converter/model/convert_result.py +61 -0
  16. deltacat/compute/converter/model/converter_session_params.py +52 -10
  17. deltacat/compute/converter/pyiceberg/overrides.py +181 -62
  18. deltacat/compute/converter/steps/convert.py +84 -36
  19. deltacat/compute/converter/steps/dedupe.py +25 -4
  20. deltacat/compute/converter/utils/convert_task_options.py +42 -13
  21. deltacat/compute/converter/utils/iceberg_columns.py +5 -0
  22. deltacat/compute/converter/utils/io.py +82 -11
  23. deltacat/compute/converter/utils/s3u.py +13 -4
  24. deltacat/compute/jobs/client.py +406 -0
  25. deltacat/constants.py +5 -6
  26. deltacat/env.py +10 -0
  27. deltacat/examples/basic_logging.py +6 -6
  28. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  29. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  30. deltacat/examples/hello_world.py +4 -2
  31. deltacat/examples/indexer/indexer.py +163 -0
  32. deltacat/examples/indexer/job_runner.py +198 -0
  33. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  34. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  35. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
  36. deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
  37. deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
  38. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  39. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  40. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
  41. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  42. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  43. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  44. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  45. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  46. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  47. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  48. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  49. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
  50. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  51. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  52. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  53. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  54. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  55. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  56. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  57. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  58. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  59. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
  60. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  61. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  62. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  63. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  64. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  65. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  66. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  67. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  68. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  69. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  70. deltacat/io/__init__.py +13 -0
  71. deltacat/io/dataset/__init__.py +0 -0
  72. deltacat/io/dataset/deltacat_dataset.py +91 -0
  73. deltacat/io/datasink/__init__.py +0 -0
  74. deltacat/io/datasink/deltacat_datasink.py +207 -0
  75. deltacat/io/datasource/__init__.py +0 -0
  76. deltacat/io/datasource/deltacat_datasource.py +580 -0
  77. deltacat/io/reader/__init__.py +0 -0
  78. deltacat/io/reader/deltacat_read_api.py +172 -0
  79. deltacat/storage/__init__.py +2 -0
  80. deltacat/storage/model/expression/__init__.py +47 -0
  81. deltacat/storage/model/expression/expression.py +656 -0
  82. deltacat/storage/model/expression/visitor.py +248 -0
  83. deltacat/storage/model/metafile.py +74 -42
  84. deltacat/storage/model/scan/push_down.py +32 -5
  85. deltacat/storage/model/shard.py +6 -2
  86. deltacat/storage/model/types.py +5 -3
  87. deltacat/tests/_io/reader/__init__.py +0 -0
  88. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  89. deltacat/tests/catalog/data/__init__.py +0 -0
  90. deltacat/tests/catalog/main/__init__.py +0 -0
  91. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  92. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
  93. deltacat/tests/catalog/model/__init__.py +0 -0
  94. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  95. deltacat/tests/catalog/test_catalogs.py +52 -98
  96. deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
  97. deltacat/tests/compute/converter/test_convert_session.py +209 -46
  98. deltacat/tests/daft/__init__.py +0 -0
  99. deltacat/tests/daft/test_model.py +97 -0
  100. deltacat/tests/experimental/__init__.py +0 -0
  101. deltacat/tests/experimental/catalog/__init__.py +0 -0
  102. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  103. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  104. deltacat/tests/experimental/daft/__init__.py +0 -0
  105. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  106. deltacat/tests/experimental/storage/__init__.py +0 -0
  107. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  108. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  109. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  110. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  111. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  112. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  113. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  114. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  115. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  116. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  117. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  118. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  119. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  120. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  121. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  122. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  123. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  124. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  125. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  126. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  127. deltacat/tests/local_deltacat_storage/__init__.py +1 -0
  128. deltacat/tests/storage/model/test_expression.py +327 -0
  129. deltacat/tests/storage/model/test_shard.py +3 -1
  130. deltacat/tests/test_deltacat_api.py +50 -9
  131. deltacat/types/media.py +141 -43
  132. deltacat/types/tables.py +35 -7
  133. deltacat/utils/daft.py +531 -5
  134. deltacat/utils/export.py +3 -1
  135. deltacat/utils/filesystem.py +39 -9
  136. deltacat/utils/polars.py +128 -0
  137. deltacat/utils/pyarrow.py +151 -15
  138. deltacat/utils/ray_utils/concurrency.py +1 -1
  139. deltacat/utils/ray_utils/runtime.py +56 -4
  140. deltacat/utils/url.py +1284 -0
  141. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +11 -9
  142. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +168 -123
  143. deltacat/catalog/iceberg/__init__.py +0 -4
  144. deltacat/daft/daft_scan.py +0 -111
  145. deltacat/daft/model.py +0 -258
  146. deltacat/examples/common/fixtures.py +0 -15
  147. deltacat/storage/rivulet/__init__.py +0 -11
  148. deltacat/storage/rivulet/feather/__init__.py +0 -5
  149. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  150. /deltacat/{daft → compute/jobs}/__init__.py +0 -0
  151. /deltacat/examples/{common → experimental}/__init__.py +0 -0
  152. /deltacat/examples/{iceberg → experimental/iceberg}/__init__.py +0 -0
  153. /deltacat/{storage/iceberg → examples/indexer}/__init__.py +0 -0
  154. /deltacat/{storage/rivulet/arrow → examples/indexer/aws}/__init__.py +0 -0
  155. /deltacat/{storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  156. /deltacat/{storage/rivulet/metastore → experimental/catalog}/__init__.py +0 -0
  157. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  158. /deltacat/{storage/rivulet/reader → experimental/storage}/__init__.py +0 -0
  159. /deltacat/{storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  160. /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
  161. /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  162. /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/fs}/__init__.py +0 -0
  163. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  164. /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/metastore}/__init__.py +0 -0
  165. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  166. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  167. /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
  168. /deltacat/{tests/storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
  169. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  170. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
  171. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  172. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  173. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
  174. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
  175. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py
@@ -4,9 +4,7 @@ import deltacat as dc
 
 from deltacat import logs
 from deltacat import IcebergCatalog
-from deltacat.examples.common.fixtures import (
-    store_cli_args_in_os_environ,
-)
+from env import store_cli_args_in_os_environ
 
 from pyiceberg.schema import (
     Schema,
@@ -22,7 +20,7 @@ from pyiceberg.transforms import DayTransform, IdentityTransform
 from pyiceberg.table.sorting import SortField, SortOrder
 
 from deltacat.exceptions import TableAlreadyExistsError
-from deltacat.storage.iceberg.model import (
+from deltacat.experimental.storage.iceberg.model import (
     SchemaMapper,
     PartitionSchemeMapper,
     SortSchemeMapper,

deltacat/examples/hello_world.py
@@ -1,12 +1,10 @@
 import ray
 import deltacat
 import daft
-import pyiceberg
 
 
 def print_package_version_info():
     print(f"DeltaCAT Version: {deltacat.__version__}")
-    print(f"PyIceberg Version: {pyiceberg.__version__}")
     print(f"Ray Version: {ray.__version__}")
     print(f"Daft Version: {daft.__version__}")
 
@@ -24,4 +22,8 @@ def run():
 
 
 if __name__ == "__main__":
+    # initialize deltacat
+    deltacat.init()
+
+    # run the example
     run()

deltacat/examples/indexer/indexer.py
@@ -0,0 +1,163 @@
+import argparse
+
+from datetime import datetime
+
+import ray
+
+import deltacat
+import daft
+import pyarrow as pa
+import pandas as pd
+import polars as pl
+import numpy as np
+
+from deltacat import DeltaCatUrl
+
+
+def print_package_version_info() -> None:
+    print(f"DeltaCAT Version: {deltacat.__version__}")
+    print(f"Ray Version: {ray.__version__}")
+    print(f"Daft Version: {daft.__version__}")
+    print(f"NumPy Version: {np.__version__}")
+    print(f"PyArrow Version: {pa.__version__}")
+    print(f"Polars Version: {pl.__version__}")
+    print(f"Pandas Version: {pd.__version__}")
+
+
+def json_path_to_regex(path: str):
+    if not path:
+        raise ValueError("Path cannot be empty")
+    parts = path.split("/")
+    leaf_key = parts.pop()
+    regex = r""
+    for part in parts:
+        if part.strip():  # discard leading and/or redundant separators
+            regex += rf'"{part}"\s*:\s*[{{\[].*?'
+    regex += rf'"{leaf_key}"\s*:\s*"(?<{leaf_key}>.*?)"'
+    return regex
+
+
+def run(
+    source: str,
+    dest: str,
+) -> None:
+    # print package version info
+    print_package_version_info()
+
+    # run a synchronous copy from the source to the destination
+    deltacat.copy(
+        DeltaCatUrl(source),
+        DeltaCatUrl(dest),
+        # reader arguments to pass to the default reader (polars)
+        # for the given text-based datasource, it accepts the same
+        # arguments as polars.read_csv except for `source`, `n_threads`,
+        # `new_columns`, `separator`, `has_header`, `quote_char`, and
+        # `infer_schema`.
+        reader_args={
+            "low_memory": True,  # try to use less memory (++stability, --perf)
+            "batch_size": 1024,  # text line count read into a buffer at once
+            "use_pyarrow": True,  # use the native pyarrow reader
+        },
+        # writer arguments to pass to the default writer (polars)
+        # for the given parquet-based datasink, it generally accepts the same
+        # arguments as polars.DataFrame.write_{dest-type} except for `file`
+        writer_args={
+            "compression": "lz4",  # faster compression & decompression
+            # "compression": "zstd",  # better compression ratio
+            # "compression": "snappy",  # compatible w/ older Parquet readers
+        },
+        # Transforms to run against the default polars dataframe read.
+        # By default, each transform takes a polars dataframe `df` as input
+        # and produces a polars dataframe as output. All transforms listed
+        # are run in order (i.e., the dataframe output from transform[0]
+        # is the dataframe input to transform[1]).
+        #
+        # See:
+        # https://docs.pola.rs/api/python/stable/reference/dataframe/index.html
+        # https://docs.pola.rs/api/python/stable/reference/expressions/index.html
+        transforms=[
+            lambda df, src: df.rename(
+                {"text": "utf8_body"},
+            ),
+            lambda df, src: df.with_columns(
+                pl.col("utf8_body").hash().alias("utf8_body_hash"),
+                pl.lit(datetime.utcnow()).dt.datetime().alias("processing_time"),
+                pl.lit(src.url_path).alias("source_file_path"),
+            ),
+        ],
+    )
+
+
+if __name__ == "__main__":
+    """
+    Example 1: Run this script locally using Ray:
+    $ python indexer.py \
+    $ --source 'text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31' \
+    $ --dest 'parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet'
+
+    Example 2: Submit this script as a local Ray job using a local job client:
+    >>> from deltacat import local_job_client
+    >>> client = local_job_client()
+    >>> # read the source file as line-delimited text
+    >>> src = "text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31"
+    >>> # write to the destination file using the default DeltaCAT Parquet writer (i.e., polars.DataFrame.write_parquet)
+    >>> dst = "parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet"
+    >>> try:
+    >>>     job_run_result = client.run_job(
+    >>>         # Entrypoint shell command to run the indexer job
+    >>>         entrypoint=f"python indexer.py --source '{src}' --dest '{dst}'",
+    >>>         # Path to the local directory that contains the indexer.py file
+    >>>         runtime_env={"working_dir": "./deltacat/examples/indexer.py"},
+    >>>     )
+    >>>     print(f"Job ID {job_run_result.job_id} terminal state: {job_run_result.job_status}")
+    >>>     print(f"Job ID {job_run_result.job_id} logs: ")
+    >>>     print(job_run_result.job_logs)
+    >>> except RuntimeError as e:
+    >>>     print(f"Job Run Failed: {e}")
+    >>> except TimeoutError as e:
+    >>>     print(f"Job Run Timed Out: {e}")
+
+    Example 3: Submit this script as a remote Ray job using a remote job client:
+    >>> from deltacat import job_client
+    >>> # use `deltacat.yaml` from the current working directory as the ray cluster launcher config file
+    >>> # automatically launches the cluster if it doesn't exist or has died
+    >>> # automatically forwards the ray cluster's dashboard for viewing in a web browser @ http://localhost:8265
+    >>> client = job_client()
+    >>> # ... follow the same steps as above to submit a synchronous indexer job ...
+    >>>
+    >>> # OR use an explicit cluster launcher config file path
+    >>> client = job_client("/Users/pdames/workspace/deltacat.yaml")
+    >>> # ... follow the same steps as above to submit a synchronous indexer job ...
+    """
+    script_args = [
+        (
+            [
+                "--source",
+            ],
+            {
+                "help": "Source DeltaCAT URL to index.",
+                "type": str,
+            },
+        ),
+        (
+            [
+                "--dest",
+            ],
+            {
+                "help": "Destination DeltaCAT URL to index.",
+                "type": str,
+            },
+        ),
+    ]
+    # parse CLI input arguments
+    parser = argparse.ArgumentParser()
+    for args, kwargs in script_args:
+        parser.add_argument(*args, **kwargs)
+    args = parser.parse_args()
+    print(f"Command Line Arguments: {args}")
+
+    # initialize deltacat
+    deltacat.init()
+
+    # run the example using the parsed arguments
+    run(**vars(args))
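
The indexer example above is built on the new URL-based deltacat.copy API. As a quick orientation, here is a minimal sketch of its core call pattern, assuming only the names that appear in this diff (deltacat.init, deltacat.copy, DeltaCatUrl) and assuming reader_args, writer_args, and transforms are optional; the bucket and key names are placeholders:

import deltacat
from deltacat import DeltaCatUrl

deltacat.init()

# copy line-delimited text into a single Parquet file using the default
# polars-based reader and writer described in the hunk above
deltacat.copy(
    DeltaCatUrl("text+s3://example-bucket/input.txt"),  # placeholder source
    DeltaCatUrl("parquet+s3://example-bucket/output.parquet"),  # placeholder destination
)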

deltacat/examples/indexer/job_runner.py
@@ -0,0 +1,198 @@
+import argparse
+import pathlib
+
+from deltacat.compute import (
+    job_client,
+    JobStatus,
+)
+
+
+def run_async(
+    source: str,
+    dest: str,
+    jobs_to_submit: int,
+    job_timeout: int,
+    cloud: str,
+    restart_ray: bool,
+):
+    # print package version info
+    working_dir = pathlib.Path(__file__).parent
+    cluster_cfg_file_path = working_dir.joinpath(cloud).joinpath("deltacat.yaml")
+    job_number = 0
+    client = job_client(cluster_cfg_file_path, restart_ray=restart_ray)
+    job_ids = []
+    while jobs_to_submit > 0:
+        jobs_to_submit -= 1
+        job_dest = dest + f".{job_number}"
+        job_id = client.submit_job(
+            # Entrypoint shell command to execute
+            entrypoint=f"python3 indexer.py --source '{source}' --dest '{job_dest}'",
+            # Path to the local directory that contains the indexer.py file
+            # This entire directory will be zipped into a job package, so keep
+            # it small.
+            runtime_env={"working_dir": working_dir},
+        )
+        job_ids.append(job_id)
+        job_number += 1
+
+    print("Waiting for all jobs to complete...")
+    job_number = 0
+    all_job_logs = ""
+    for job_id in job_ids:
+        job_status = client.await_job(job_id, timeout_seconds=job_timeout)
+        if job_status != JobStatus.SUCCEEDED:
+            print(f"Job `{job_id}` logs: ")
+            print(client.get_job_logs(job_id))
+            raise RuntimeError(f"Job `{job_id}` terminated with status: {job_status}")
+        all_job_logs += f"\nJob #{job_number} logs: \n"
+        all_job_logs += client.get_job_logs(job_id)
+        job_number += 1
+    print("All jobs completed!")
+    print("Job Logs: ")
+    print(all_job_logs)
+
+
+def run_sync(
+    source: str,
+    dest: str,
+    jobs_to_submit: int,
+    job_timeout: int,
+    cloud: str,
+    restart_ray: bool,
+):
+    working_dir = pathlib.Path(__file__).parent
+    cluster_cfg_file_path = working_dir.joinpath(cloud).joinpath("deltacat.yaml")
+    client = job_client(cluster_cfg_file_path, restart_ray=restart_ray)
+    job_number = 0
+    while job_number < jobs_to_submit:
+        job_dest = dest + f".{job_number}"
+        job_run_result = client.run_job(
+            # Entrypoint shell command to execute
+            entrypoint=f"python3 indexer.py --source '{source}' --dest '{job_dest}'",
+            # Path to the local directory that contains the indexer.py file
+            # This entire directory will be zipped into a job package, so keep
+            # it small.
+            runtime_env={"working_dir": working_dir},
+            timeout_seconds=job_timeout,
+        )
+        print(
+            f"Job ID {job_run_result.job_id} terminal state: {job_run_result.job_status}"
+        )
+        print(f"Job ID {job_run_result.job_id} logs: ")
+        print(job_run_result.job_logs)
+        job_number += 1
+
+
+def run(
+    source: str,
+    dest: str,
+    restart_ray: bool,
+    jobs_to_submit: int,
+    job_timeout: int,
+    asynchronous: bool,
+    cloud_provider: str,
+):
+    run_func = run_async if asynchronous else run_sync
+    run_func(
+        source=source,
+        dest=dest,
+        jobs_to_submit=jobs_to_submit,
+        job_timeout=job_timeout,
+        cloud=cloud_provider,
+        restart_ray=restart_ray,
+    )
+
+
+if __name__ == "__main__":
+    """
+    # Run this example through a command of the form:
+    $ python ./deltacat/examples/job_runner.py -- \
+    $ --source text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31 \
+    $ --dest parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet \
+    $ --asynchronous \
+    $ --jobs-to-submit 100 \
+    $ --job-timeout 90 \
+    $ --cloud-provider aws
+    """
+    script_args = [
+        (
+            [
+                "--source",
+            ],
+            {
+                "help": "Source DeltaCAT URL to index.",
+                "type": str,
+                "default": "text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31",
+            },
+        ),
+        (
+            [
+                "--dest",
+            ],
+            {
+                "help": "Destination DeltaCAT URL to store the indexed file.",
+                "type": str,
+                "default": "parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet",
+            },
+        ),
+        (
+            [
+                "--restart-ray",
+            ],
+            {
+                "help": "Restart Ray on an existing cluster.",
+                "action": "store_true",
+                "default": False,
+            },
+        ),
+        (
+            [
+                "--asynchronous",
+            ],
+            {
+                "help": "Run jobs asynchronously.",
+                "action": "store_true",
+                "default": False,
+            },
+        ),
+        (
+            [
+                "--jobs-to-submit",
+            ],
+            {
+                "help": "Number of indexer jobs to submit for execution.",
+                "type": int,
+                "default": 1,
+            },
+        ),
+        (
+            [
+                "--job-timeout",
+            ],
+            {
+                "help": "Job timeout in seconds.",
+                "type": int,
+                "default": 300,
+            },
+        ),
+        (
+            [
+                "--cloud-provider",
+            ],
+            {
+                "help": "Ray Cluster Cloud Provider ('aws' or 'gcp')",
+                "type": str,
+                "default": "aws",
+            },
+        ),
+    ]
+
+    # parse CLI input arguments
+    parser = argparse.ArgumentParser()
+    for args, kwargs in script_args:
+        parser.add_argument(*args, **kwargs)
+    args = parser.parse_args()
+    print(f"Command Line Arguments: {args}")
+
+    # run the example using os.environ as kwargs
+    run(**vars(args))
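
The job_runner example above drives the new Ray job client added in deltacat/compute/jobs/client.py. A condensed sketch of the submit-then-await pattern it uses, assuming only the calls that appear in this diff (job_client, submit_job, await_job, get_job_logs, JobStatus); the cluster config path and entrypoint are placeholders:

from deltacat.compute import job_client, JobStatus

# connect to (or launch) the Ray cluster described by the launcher config
client = job_client("deltacat.yaml")  # placeholder cluster launcher config path

# submit a job, then block until it reaches a terminal state
job_id = client.submit_job(
    entrypoint="python3 indexer.py --source '...' --dest '...'",  # placeholder
    runtime_env={"working_dir": "."},  # zipped into the job package
)
job_status = client.await_job(job_id, timeout_seconds=300)
if job_status != JobStatus.SUCCEEDED:
    print(client.get_job_logs(job_id))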

deltacat/experimental/catalog/iceberg/__init__.py
@@ -0,0 +1,6 @@
+from deltacat.experimental.catalog.iceberg.iceberg_catalog_config import (
+    IcebergCatalogConfig,
+)
+import deltacat.experimental.catalog.iceberg.impl as IcebergCatalog
+
+__all__ = ["IcebergCatalogConfig", "IcebergCatalog"]

deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py
@@ -15,7 +15,7 @@ class IcebergCatalogConfig:
 
     This configuration is passed through to PyIceberg by invoking load_catalog.
     The Properties provided must match properties accepted by PyIceberg for each catalog type
-    See: :func:`deltacat.catalog.iceberg.initialize`
+    See: :func:`deltacat.experimental.catalog.iceberg.initialize`
 
     Attributes:
         type: The PyIceberg Catalog instance

deltacat/{catalog → experimental/catalog}/iceberg/impl.py
@@ -1,4 +1,5 @@
 import logging
+import sys
 
 from typing import Any, Dict, List, Optional, Union
 
@@ -7,13 +8,19 @@ from daft.daft import ScanOperatorHandle, StorageConfig
 from daft.logical.builder import LogicalPlanBuilder
 
 from deltacat import logs
+from deltacat.catalog.model.catalog import Catalog
 from deltacat.catalog.model.table_definition import TableDefinition
-from deltacat.daft.daft_scan import DeltaCatScanOperator
+from deltacat.utils.daft import DeltaCatScanOperator
 from deltacat.exceptions import TableAlreadyExistsError
-from deltacat.storage.iceberg.iceberg_scan_planner import IcebergScanPlanner
-from deltacat.storage.iceberg.model import PartitionSchemeMapper, SchemaMapper
+from deltacat.experimental.storage.iceberg.iceberg_scan_planner import (
+    IcebergScanPlanner,
+)
+from deltacat.experimental.storage.iceberg.model import (
+    PartitionSchemeMapper,
+    SchemaMapper,
+)
 from deltacat.storage.model.partition import PartitionScheme
-from deltacat.storage.iceberg.impl import _get_native_catalog
+from deltacat.experimental.storage.iceberg.impl import _get_native_catalog
 from deltacat.storage.model.sort_key import SortScheme
 from deltacat.storage.model.list_result import ListResult
 from deltacat.storage.model.namespace import Namespace, NamespaceProperties
@@ -26,20 +33,31 @@ from deltacat.storage.model.types import (
     LocalTable,
     StreamFormat,
 )
-from deltacat.storage.iceberg import impl as IcebergStorage
+from deltacat.experimental.storage.iceberg import impl as IcebergStorage
 from deltacat.types.media import ContentType
 from deltacat.types.tables import TableWriteMode
 from deltacat.constants import DEFAULT_NAMESPACE
-from deltacat.catalog.iceberg.iceberg_catalog_config import IcebergCatalogConfig
+from deltacat.experimental.catalog.iceberg.iceberg_catalog_config import (
+    IcebergCatalogConfig,
+)
 
-from pyiceberg.catalog import Catalog, load_catalog
+from pyiceberg.catalog import Catalog as PyIcebergCatalog, load_catalog
 from pyiceberg.transforms import BucketTransform
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
+IcebergCatalog = sys.modules[__name__]
+
+
+def from_config(config: IcebergCatalogConfig, *args, **kwargs) -> Catalog:
+    """
+    Factory method to construct a catalog from Iceberg catalog configuration.
+    """
+    return Catalog(config, impl=IcebergCatalog, *args, **kwargs)
+
 
 # catalog functions
-def initialize(*args, config: IcebergCatalogConfig, **kwargs) -> Catalog:
+def initialize(config: IcebergCatalogConfig, **kwargs) -> PyIcebergCatalog:
     """
     Initializes an Iceberg catalog with the given config.
 
@@ -123,7 +141,7 @@ def write_to_table(
     )
     # TODO(pdames): only append s3:// to output file paths when writing to S3!
     out_file_paths = [f"s3://{val}" for val in out_df.to_arrow()[0]]
-    from deltacat.catalog.iceberg import overrides
+    from deltacat.experimental.catalog.iceberg import overrides
     overrides.append(
        table_definition.table.native_object,
        out_file_paths,

deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py
@@ -5,7 +5,7 @@ from deltacat.storage.model.scan.push_down import Pushdown
 from deltacat.storage.model.scan.scan_plan import ScanPlan
 from deltacat.storage.model.scan.scan_task import FileScanTask, DataFile
 from deltacat.storage.util.scan_planner import ScanPlanner
-from deltacat.storage.iceberg.impl import _try_load_iceberg_table
+from deltacat.experimental.storage.iceberg.impl import _try_load_iceberg_table
 
 
 class IcebergScanPlanner(ScanPlanner):

deltacat/{storage → experimental/storage}/iceberg/impl.py
@@ -32,7 +32,7 @@ from deltacat.storage import (
     NamespaceProperties,
 )
 from deltacat.storage.model.manifest import Manifest
-from deltacat.storage.iceberg.model import (
+from deltacat.experimental.storage.iceberg.model import (
     SchemaMapper,
     PartitionSchemeMapper,
     SortSchemeMapper,

deltacat/experimental/storage/rivulet/__init__.py
@@ -0,0 +1,11 @@
+from deltacat.experimental.storage.rivulet.schema.schema import Schema
+from deltacat.experimental.storage.rivulet.schema.schema import Field
+from deltacat.experimental.storage.rivulet.dataset import Dataset
+from deltacat.experimental.storage.rivulet.schema.schema import Datatype
+
+__all__ = [
+    "Schema",
+    "Field",
+    "Dataset",
+    "Datatype",
+]

deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py
@@ -2,10 +2,13 @@ from abc import ABC, abstractmethod
 from typing import Iterator, List, Any
 import pyarrow as pa
 
-from deltacat.storage.rivulet.metastore.sst import SSTableRow
-from deltacat.storage.rivulet import Schema
-from deltacat.storage.rivulet.serializer import DataSerializer, MEMTABLE_DATA
-from deltacat.storage.rivulet.fs.file_provider import FileProvider
+from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
+from deltacat.experimental.storage.rivulet import Schema
+from deltacat.experimental.storage.rivulet.serializer import (
+    DataSerializer,
+    MEMTABLE_DATA,
+)
+from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
 
 
 class ArrowSerializer(DataSerializer, ABC):

deltacat/{storage → experimental/storage}/rivulet/dataset.py
@@ -24,19 +24,23 @@ from deltacat.storage.model.shard import Shard, ShardingStrategy
 from deltacat.storage.model.stream import Stream, StreamLocator
 from deltacat.storage.model.transaction import TransactionOperationList
 from deltacat.storage.model.types import CommitState, StreamFormat
-from deltacat.storage.rivulet.fs.file_store import FileStore
-from deltacat.storage.rivulet.fs.file_provider import FileProvider
-from deltacat.storage.rivulet.reader.dataset_metastore import DatasetMetastore
-from deltacat.storage.rivulet import Schema, Field
+from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
+from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
+from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
+    DatasetMetastore,
+)
+from deltacat.experimental.storage.rivulet import Schema, Field
 from deltacat.utils.export import export_dataset
 from .schema.schema import Datatype
 
-from deltacat.storage.rivulet.reader.data_scan import DataScan
-from deltacat.storage.rivulet.reader.dataset_reader import DatasetReader
-from deltacat.storage.rivulet.reader.query_expression import QueryExpression
+from deltacat.experimental.storage.rivulet.reader.data_scan import DataScan
+from deltacat.experimental.storage.rivulet.reader.dataset_reader import DatasetReader
+from deltacat.experimental.storage.rivulet.reader.query_expression import (
+    QueryExpression,
+)
 
-from deltacat.storage.rivulet.writer.dataset_writer import DatasetWriter
-from deltacat.storage.rivulet.writer.memtable_dataset_writer import (
+from deltacat.experimental.storage.rivulet.writer.dataset_writer import DatasetWriter
+from deltacat.experimental.storage.rivulet.writer.memtable_dataset_writer import (
     MemtableDatasetWriter,
 )
 

deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py
@@ -2,13 +2,16 @@ from __future__ import annotations
 
 from typing import List, Callable, Any
 
-from deltacat.storage.rivulet.field_group import FieldGroup
-from deltacat.storage.rivulet.mvp.Table import MvpTable
-from deltacat.storage.rivulet import Schema
-from deltacat.storage.rivulet.reader.data_scan import DataScan
-from deltacat.storage.rivulet.reader.dataset_metastore import DatasetMetastore
-from deltacat.storage.rivulet.reader.dataset_reader import DatasetReader
-from deltacat.storage.rivulet.reader.query_expression import QueryExpression
+from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable
+from deltacat.experimental.storage.rivulet import Schema
+from deltacat.experimental.storage.rivulet.reader.data_scan import DataScan
+from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
+    DatasetMetastore,
+)
+from deltacat.experimental.storage.rivulet.reader.dataset_reader import DatasetReader
+from deltacat.experimental.storage.rivulet.reader.query_expression import (
+    QueryExpression,
+)
 
 
 class DatasetExecutor:
@@ -22,12 +25,10 @@ class DatasetExecutor:
 
     def __init__(
         self,
-        field_groups: List[FieldGroup],
         schema: Schema,
         metastore: DatasetMetastore,
     ):
         self.effective_schema: Schema = schema.__deepcopy__()
-        self.field_groups = field_groups
         self.output: MvpTable | None = None
         self._metastore = metastore
 
@@ -64,18 +65,9 @@
 
         TODO for now this is doing dumb in-memory implementation and later this is going to be replaced by rust library
         """
-        if len(self.field_groups) == 1:
-            return self._read_as_mvp_table(schema, self.field_groups[0])
-        else:
-            ds1 = self._read_as_mvp_table(schema, self.field_groups[0])
-            ds2 = self._read_as_mvp_table(schema, self.field_groups[1])
-            merged = MvpTable.merge(ds1, ds2, schema.primary_key.name)
-            for i in range(2, len(self.field_groups)):
-                ds_i = self._read_as_mvp_table(schema, self.field_groups[i])
-                merged = MvpTable.merge(merged, ds_i, schema.primary_key.name)
-            return merged
+        return self._read_as_mvp_table(schema)
 
-    def _read_as_mvp_table(self, schema: Schema, field_group: FieldGroup):
+    def _read_as_mvp_table(self, schema: Schema):
         data = list(
             DataScan(
                 schema, QueryExpression(), DatasetReader(self._metastore)

deltacat/experimental/storage/rivulet/feather/__init__.py
@@ -0,0 +1,7 @@
+# TODO later on this will be moved to a dedicated package
+from deltacat.experimental.storage.rivulet.feather.file_reader import FeatherFileReader
+from deltacat.experimental.storage.rivulet.reader.reader_type_registrar import (
+    FileReaderRegistrar,
+)
+
+FileReaderRegistrar.register_reader("feather", FeatherFileReader)

deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py
@@ -5,15 +5,17 @@ from typing import Optional
 import pyarrow.ipc
 from pyarrow import RecordBatch, RecordBatchFileReader
 
-from deltacat.storage.rivulet.fs.file_provider import FileProvider
-from deltacat.storage.rivulet.metastore.sst import SSTableRow
-from deltacat.storage.rivulet.reader.data_reader import (
+from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
+from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
+from deltacat.experimental.storage.rivulet.reader.data_reader import (
     RowAndKey,
     FileReader,
     FILE_FORMAT,
 )
-from deltacat.storage.rivulet.reader.pyarrow_data_reader import RecordBatchRowIndex
-from deltacat.storage.rivulet.schema.schema import Schema
+from deltacat.experimental.storage.rivulet.reader.pyarrow_data_reader import (
+    RecordBatchRowIndex,
+)
+from deltacat.experimental.storage.rivulet.schema.schema import Schema
 
 
 class FeatherFileReader(FileReader[RecordBatchRowIndex]):