deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. deltacat/__init__.py +27 -6
  2. deltacat/api.py +478 -123
  3. deltacat/aws/s3u.py +2 -2
  4. deltacat/benchmarking/conftest.py +1 -1
  5. deltacat/catalog/main/impl.py +12 -6
  6. deltacat/catalog/model/catalog.py +65 -47
  7. deltacat/catalog/model/properties.py +1 -3
  8. deltacat/compute/__init__.py +14 -0
  9. deltacat/compute/converter/constants.py +5 -0
  10. deltacat/compute/converter/converter_session.py +78 -36
  11. deltacat/compute/converter/model/convert_input.py +24 -4
  12. deltacat/compute/converter/model/convert_result.py +61 -0
  13. deltacat/compute/converter/model/converter_session_params.py +52 -10
  14. deltacat/compute/converter/pyiceberg/overrides.py +181 -62
  15. deltacat/compute/converter/steps/convert.py +84 -36
  16. deltacat/compute/converter/steps/dedupe.py +25 -4
  17. deltacat/compute/converter/utils/convert_task_options.py +42 -13
  18. deltacat/compute/converter/utils/iceberg_columns.py +5 -0
  19. deltacat/compute/converter/utils/io.py +82 -11
  20. deltacat/compute/converter/utils/s3u.py +13 -4
  21. deltacat/compute/jobs/__init__.py +0 -0
  22. deltacat/compute/jobs/client.py +404 -0
  23. deltacat/constants.py +4 -4
  24. deltacat/daft/daft_scan.py +7 -3
  25. deltacat/daft/translator.py +126 -0
  26. deltacat/examples/basic_logging.py +5 -3
  27. deltacat/examples/hello_world.py +4 -2
  28. deltacat/examples/indexer/__init__.py +0 -0
  29. deltacat/examples/indexer/aws/__init__.py +0 -0
  30. deltacat/examples/indexer/gcp/__init__.py +0 -0
  31. deltacat/examples/indexer/indexer.py +163 -0
  32. deltacat/examples/indexer/job_runner.py +199 -0
  33. deltacat/io/__init__.py +13 -0
  34. deltacat/io/dataset/__init__.py +0 -0
  35. deltacat/io/dataset/deltacat_dataset.py +91 -0
  36. deltacat/io/datasink/__init__.py +0 -0
  37. deltacat/io/datasink/deltacat_datasink.py +207 -0
  38. deltacat/io/datasource/__init__.py +0 -0
  39. deltacat/io/datasource/deltacat_datasource.py +580 -0
  40. deltacat/io/reader/__init__.py +0 -0
  41. deltacat/io/reader/deltacat_read_api.py +172 -0
  42. deltacat/storage/__init__.py +2 -0
  43. deltacat/storage/model/expression/__init__.py +47 -0
  44. deltacat/storage/model/expression/expression.py +656 -0
  45. deltacat/storage/model/expression/visitor.py +248 -0
  46. deltacat/storage/model/metafile.py +74 -42
  47. deltacat/storage/model/scan/push_down.py +32 -5
  48. deltacat/storage/model/types.py +5 -3
  49. deltacat/storage/rivulet/__init__.py +4 -4
  50. deltacat/tests/_io/reader/__init__.py +0 -0
  51. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  52. deltacat/tests/compute/converter/test_convert_session.py +209 -46
  53. deltacat/tests/local_deltacat_storage/__init__.py +1 -0
  54. deltacat/tests/storage/model/test_expression.py +327 -0
  55. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +2 -1
  56. deltacat/tests/storage/rivulet/test_dataset.py +1 -1
  57. deltacat/tests/storage/rivulet/test_manifest.py +1 -1
  58. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +1 -1
  59. deltacat/tests/test_deltacat_api.py +50 -9
  60. deltacat/types/media.py +141 -43
  61. deltacat/types/tables.py +35 -7
  62. deltacat/utils/daft.py +2 -2
  63. deltacat/utils/filesystem.py +39 -9
  64. deltacat/utils/polars.py +128 -0
  65. deltacat/utils/pyarrow.py +151 -15
  66. deltacat/utils/ray_utils/concurrency.py +1 -1
  67. deltacat/utils/ray_utils/runtime.py +56 -4
  68. deltacat/utils/url.py +1284 -0
  69. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/METADATA +9 -6
  70. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/RECORD +73 -48
  71. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/LICENSE +0 -0
  72. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/WHEEL +0 -0
  73. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/top_level.txt +0 -0
deltacat/daft/daft_scan.py CHANGED
@@ -9,10 +9,11 @@ from daft.daft import (
      FileFormatConfig,
      ParquetSourceConfig,
  )
- from daft.io.scan import ScanOperator
+ from daft.io.scan import ScanOperator, ScanPushdowns
  
  from deltacat.catalog.model.table_definition import TableDefinition
  from deltacat.daft.model import DaftPartitionKeyMapper
+ from deltacat.daft.translator import translate_pushdown
  
  
  class DeltaCatScanOperator(ScanOperator):
@@ -44,8 +45,11 @@ class DeltaCatScanOperator(ScanOperator):
          ]
  
      def to_scan_tasks(self, pushdowns: Pushdowns) -> Iterator[ScanTask]:
-         # TODO: implement pushdown predicate on DeltaCAT
-         dc_scan_plan = self.table.create_scan_plan()
+         daft_pushdowns = ScanPushdowns._from_pypushdowns(
+             pushdowns, schema=self.schema()
+         )
+         dc_pushdown = translate_pushdown(daft_pushdowns)
+         dc_scan_plan = self.table.create_scan_plan(pushdown=dc_pushdown)
          scan_tasks = []
          file_format_config = FileFormatConfig.from_parquet_config(
              # maybe this: ParquetSourceConfig(field_id_mapping=self._field_id_mapping)
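The net effect of this change is that Daft scan predicates now reach DeltaCAT's scan planner instead of being ignored. As a rough sketch of what the new translator module (next diff) produces, a Daft predicate equivalent to `col("year") > 2020` (column name hypothetical) becomes the following DeltaCAT pushdown:

    import pyarrow as pa
    from deltacat.storage.model.expression import GreaterThan, Literal, Reference
    from deltacat.storage.model.scan.push_down import PartitionFilter, Pushdown

    # equivalent of the Daft predicate `col("year") > 2020`
    predicate = GreaterThan.of(Reference("year"), Literal(pa.scalar(2020)))
    pushdown = Pushdown.of(
        row_filter=None,
        column_filter=None,
        partition_filter=PartitionFilter.of(predicate),
        limit=None,
    )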
deltacat/daft/translator.py ADDED
@@ -0,0 +1,126 @@
+ from daft.io.scan import ScanPushdowns
+ import pyarrow as pa
+ from typing import Callable, Dict
+ from daft.io.pushdowns import (
+     Expr as DaftExpr,
+     Literal as DaftLiteral,
+     Reference as DaftReference,
+     TermVisitor,
+ )
+ 
+ from deltacat.storage.model.expression import (
+     Expression,
+     Reference,
+     Literal,
+     Equal,
+     NotEqual,
+     GreaterThan,
+     LessThan,
+     GreaterThanEqual,
+     LessThanEqual,
+     And,
+     Or,
+     Not,
+     IsNull,
+ )
+ from deltacat.storage.model.scan.push_down import PartitionFilter, Pushdown
+ 
+ 
+ def translate_pushdown(pushdown: ScanPushdowns) -> Pushdown:
+     """
+     Translate a Daft ScanPushdowns object into a DeltaCAT Pushdown.
+ 
+     Args:
+         pushdown: Daft ScanPushdowns object
+ 
+     Returns:
+         Pushdown: DeltaCAT Pushdown object with translated filters
+     """
+     translator = DaftToDeltacatExpressionTranslator()
+     partition_filter = None
+ 
+     if pushdown.predicate:
+         predicate = translator.visit(pushdown.predicate, None)
+         partition_filter = PartitionFilter.of(predicate)
+ 
+     # TODO: translate other pushdown filters
+     return Pushdown.of(
+         row_filter=None,
+         column_filter=None,
+         partition_filter=partition_filter,
+         limit=None,
+     )
+ 
+ 
+ class DaftToDeltacatExpressionTranslator(TermVisitor[None, Expression]):
+     """
+     This visitor traverses a Daft expression tree and produces an equivalent
+     DeltaCAT expression tree for use in DeltaCAT's query pushdown system.
+     """
+ 
+     _PROCEDURES: Dict[str, Callable[..., Expression]] = {
+         # Comparison predicates
+         "=": Equal.of,
+         "!=": NotEqual.of,
+         "<": LessThan.of,
+         ">": GreaterThan.of,
+         "<=": LessThanEqual.of,
+         ">=": GreaterThanEqual.of,
+         # Logical predicates
+         "and": And.of,
+         "or": Or.of,
+         "not": Not.of,
+         # Special operations
+         "is_null": IsNull.of,
+     }
+ 
+     def visit_reference(self, term: DaftReference, context: None) -> Expression:
+         """
+         Convert a Daft Reference to a DeltaCAT Reference.
+ 
+         Args:
+             term: A Daft Reference expression representing a field or column.
+             context: Not used in this visitor implementation.
+ 
+         Returns:
+             Expression: A DeltaCAT Reference expression for the same field.
+         """
+         return Reference(term.path)
+ 
+     def visit_literal(self, term: DaftLiteral, context: None) -> Expression:
+         """
+         Convert a Daft Literal to a DeltaCAT Literal.
+ 
+         Args:
+             term: A Daft Literal expression representing a constant value.
+             context: Not used in this visitor implementation.
+ 
+         Returns:
+             Expression: A DeltaCAT Literal expression wrapping the same value
+                 as a PyArrow scalar.
+         """
+         return Literal(pa.scalar(term.value))
+ 
+     def visit_expr(self, term: DaftExpr, context: None) -> Expression:
+         """
+         Translate a Daft procedure call (operation) into the equivalent
+         DeltaCAT expression by dispatching through `_PROCEDURES`.
+ 
+         Args:
+             term: A Daft Expr expression representing an operation.
+             context: Not used in this visitor implementation.
+ 
+         Returns:
+             Expression: An equivalent DeltaCAT expression.
+ 
+         Raises:
+             ValueError: If the operation is not supported by DeltaCAT.
+         """
+         proc = term.proc
+         args = [self.visit(arg.term, context) for arg in term.args]
+ 
+         if proc not in self._PROCEDURES:
+             raise ValueError(f"Deltacat does not support procedure '{proc}'.")
+ 
+         return self._PROCEDURES[proc](*args)
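As a concrete illustration of the dispatch table above: for a Daft predicate equivalent to `a = 1 AND b IS NULL` (column names hypothetical), the visitor recurses through the "and", "=", and "is_null" procedures and builds the following DeltaCAT expression tree:

    import pyarrow as pa
    from deltacat.storage.model.expression import And, Equal, IsNull, Literal, Reference

    # what the translator builds for the Daft predicate `a = 1 AND b IS NULL`
    expected = And.of(
        Equal.of(Reference("a"), Literal(pa.scalar(1))),
        IsNull.of(Reference("b")),
    )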
deltacat/examples/basic_logging.py CHANGED
@@ -1,7 +1,9 @@
  import os
- import ray
+ import deltacat
  import logging
  
+ import ray
+ 
  from deltacat import logs
  from deltacat.constants import DELTACAT_APP_LOG_DIR, DELTACAT_SYS_LOG_DIR
  from deltacat.examples.common.fixtures import (
@@ -94,8 +96,8 @@ if __name__ == "__main__":
      # create any runtime environment required to run the example
      runtime_env = create_ray_runtime_environment()
  
-     # initialize ray
-     ray.init(runtime_env=runtime_env)
+     # initialize deltacat
+     deltacat.init(ray_init_args={"runtime_env": runtime_env})
  
      # run the example using os.environ as kwargs
      run(**os.environ)
deltacat/examples/hello_world.py CHANGED
@@ -1,12 +1,10 @@
  import ray
  import deltacat
  import daft
- import pyiceberg
  
  
  def print_package_version_info():
      print(f"DeltaCAT Version: {deltacat.__version__}")
-     print(f"PyIceberg Version: {pyiceberg.__version__}")
      print(f"Ray Version: {ray.__version__}")
      print(f"Daft Version: {daft.__version__}")
  
@@ -24,4 +22,8 @@ def run():
  
  
  if __name__ == "__main__":
+     # initialize deltacat
+     deltacat.init()
+ 
+     # run the example
      run()
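Both examples reflect the same migration: `deltacat.init()` now bootstraps Ray in place of a bare `ray.init()`. A minimal sketch of the pattern, assuming `ray_init_args` forwards keyword arguments to `ray.init()` as shown in basic_logging.py above:

    import deltacat

    # plain initialization (hello_world.py)
    deltacat.init()

    # ...or forward Ray initialization arguments (basic_logging.py)
    deltacat.init(ray_init_args={"num_cpus": 4})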
deltacat/examples/indexer/__init__.py ADDED
File without changes
deltacat/examples/indexer/aws/__init__.py ADDED
File without changes
deltacat/examples/indexer/gcp/__init__.py ADDED
File without changes
deltacat/examples/indexer/indexer.py ADDED
@@ -0,0 +1,163 @@
+ import argparse
+ 
+ from datetime import datetime
+ 
+ import ray
+ 
+ import deltacat
+ import daft
+ import pyarrow as pa
+ import pandas as pd
+ import polars as pl
+ import numpy as np
+ 
+ from deltacat import DeltaCatUrl
+ 
+ 
+ def print_package_version_info() -> None:
+     print(f"DeltaCAT Version: {deltacat.__version__}")
+     print(f"Ray Version: {ray.__version__}")
+     print(f"Daft Version: {daft.__version__}")
+     print(f"NumPy Version: {np.__version__}")
+     print(f"PyArrow Version: {pa.__version__}")
+     print(f"Polars Version: {pl.__version__}")
+     print(f"Pandas Version: {pd.__version__}")
+ 
+ 
+ def json_path_to_regex(path: str):
+     if not path:
+         raise ValueError("Path cannot be empty")
+     parts = path.split("/")
+     leaf_key = parts.pop()
+     regex = r""
+     for part in parts:
+         if part.strip():  # discard leading and/or redundant separators
+             regex += rf'"{part}"\s*:\s*[{{\[].*?'
+     regex += rf'"{leaf_key}"\s*:\s*"(?<{leaf_key}>.*?)"'
+     return regex
+ 
+ 
+ def run(
+     source: str,
+     dest: str,
+ ) -> None:
+     # print package version info
+     print_package_version_info()
+ 
+     # run a synchronous copy from the source to the destination
+     deltacat.copy(
+         DeltaCatUrl(source),
+         DeltaCatUrl(dest),
+         # reader arguments to pass to the default reader (polars)
+         # for the given text-based datasource, it accepts the same
+         # arguments as polars.read_csv except for `source`, `n_threads`,
+         # `new_columns`, `separator`, `has_header`, `quote_char`, and
+         # `infer_schema`.
+         reader_args={
+             "low_memory": True,  # try to use less memory (++stability, --perf)
+             "batch_size": 1024,  # text line count read into a buffer at once
+             "use_pyarrow": True,  # use the native pyarrow reader
+         },
+         # writer arguments to pass to the default writer (polars)
+         # for the given parquet-based datasink, it accepts the same
+         # arguments as polars.DataFrame.write_parquet except for `file`
+         writer_args={
+             "compression": "lz4",  # faster compression & decompression
+             # "compression": "zstd",  # better compression ratio
+             # "compression": "snappy",  # compatible w/ older Parquet readers
+         },
+         # Transforms to run against the default polars dataframe read.
+         # By default, each transform takes a polars dataframe `df` as input
+         # and produces a polars dataframe as output. All transforms listed
+         # are run in order (i.e., the dataframe output from transform[0]
+         # is the dataframe input to transform[1]).
+         #
+         # See:
+         # https://docs.pola.rs/api/python/stable/reference/dataframe/index.html
+         # https://docs.pola.rs/api/python/stable/reference/expressions/index.html
+         transforms=[
+             lambda df, src: df.rename(
+                 {"text": "utf8_body"},
+             ),
+             lambda df, src: df.with_columns(
+                 pl.col("utf8_body").hash().alias("utf8_body_hash"),
+                 pl.lit(datetime.utcnow()).dt.datetime().alias("processing_time"),
+                 pl.lit(src.url_path).alias("source_file_path"),
+             ),
+         ],
+     )
+ 
+ 
+ if __name__ == "__main__":
+     """
+     Example 1: Run this script locally using Ray:
+     $ python indexer.py \
+     $   --source 'text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31' \
+     $   --dest 'parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet'
+ 
+     Example 2: Submit this script as a local Ray job using a local job client:
+     >>> from deltacat import local_job_client
+     >>> client = local_job_client()
+     >>> # read the source file as line-delimited text
+     >>> src = "text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31"
+     >>> # write to the destination file using the default DeltaCAT Parquet writer (i.e., polars.DataFrame.write_parquet)
+     >>> dst = "parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet"
+     >>> try:
+     >>>     job_run_result = client.run_job(
+     >>>         # Entrypoint shell command to run the indexer job
+     >>>         entrypoint=f"python indexer.py --source '{src}' --dest '{dst}'",
+     >>>         # Path to the local directory that contains the indexer.py file
+     >>>         runtime_env={"working_dir": "./deltacat/examples/indexer/"},
+     >>>     )
+     >>>     print(f"Job ID {job_run_result.job_id} terminal state: {job_run_result.job_status}")
+     >>>     print(f"Job ID {job_run_result.job_id} logs: ")
+     >>>     print(job_run_result.job_logs)
+     >>> except RuntimeError as e:
+     >>>     print(f"Job Run Failed: {e}")
+     >>> except TimeoutError as e:
+     >>>     print(f"Job Run Timed Out: {e}")
+ 
+     Example 3: Submit this script as a remote Ray job using a remote job client:
+     >>> from deltacat import job_client
+     >>> # use `deltacat.yaml` from the current working directory as the ray cluster launcher config file
+     >>> # automatically launches the cluster if it doesn't exist or has died
+     >>> # automatically forwards the ray cluster's dashboard for viewing in a web browser @ http://localhost:8265
+     >>> client = job_client()
+     >>> # ... follow the same steps as above to submit a synchronous indexer job ...
+     >>>
+     >>> # OR use an explicit cluster launcher config file path
+     >>> client = job_client("/Users/pdames/workspace/deltacat.yaml")
+     >>> # ... follow the same steps as above to submit a synchronous indexer job ...
+     """
+     script_args = [
+         (
+             [
+                 "--source",
+             ],
+             {
+                 "help": "Source DeltaCAT URL to index.",
+                 "type": str,
+             },
+         ),
+         (
+             [
+                 "--dest",
+             ],
+             {
+                 "help": "Destination DeltaCAT URL to store the indexed file.",
+                 "type": str,
+             },
+         ),
+     ]
+     # parse CLI input arguments
+     parser = argparse.ArgumentParser()
+     for args, kwargs in script_args:
+         parser.add_argument(*args, **kwargs)
+     args = parser.parse_args()
+     print(f"Command Line Arguments: {args}")
+ 
+     # initialize deltacat
+     deltacat.init()
+ 
+     # run the example using the parsed arguments
+     run(**vars(args))
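As a worked example of `json_path_to_regex` above: each path segment before the leaf contributes a key-then-object prefix, and the leaf contributes a named capture group (path hypothetical):

    json_path_to_regex("authors/affiliation/name")
    # => '"authors"\\s*:\\s*[{\\[].*?"affiliation"\\s*:\\s*[{\\[].*?"name"\\s*:\\s*"(?<name>.*?)"'

Note that the `(?<name>...)` capture syntax targets engines like Rust's regex crate (used by polars); Python's `re` module expects `(?P<name>...)`.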
deltacat/examples/indexer/job_runner.py ADDED
@@ -0,0 +1,199 @@
+ import argparse
+ import pathlib
+ 
+ from deltacat.compute import (
+     job_client,
+     JobStatus,
+ )
+ 
+ 
+ def run_async(
+     source: str,
+     dest: str,
+     jobs_to_submit: int,
+     job_timeout: int,
+     cloud: str,
+     restart_ray: bool,
+ ):
+     # resolve the cluster launcher config file for the target cloud provider
+     working_dir = pathlib.Path(__file__).parent
+     cluster_cfg_file_path = working_dir.joinpath(cloud).joinpath("deltacat.yaml")
+     job_number = 0
+     client = job_client(cluster_cfg_file_path, restart_ray=restart_ray)
+     job_ids = []
+     while jobs_to_submit > 0:
+         jobs_to_submit -= 1
+         job_dest = dest + f".{job_number}"
+         job_id = client.submit_job(
+             # Entrypoint shell command to execute
+             entrypoint=f"python3 indexer.py --source '{source}' --dest '{job_dest}'",
+             # Path to the local directory that contains the indexer.py file.
+             # This entire directory will be zipped into a job package, so keep
+             # it small.
+             runtime_env={"working_dir": working_dir},
+         )
+         job_ids.append(job_id)
+         job_number += 1
+ 
+     print("Waiting for all jobs to complete...")
+     job_number = 0
+     all_job_logs = ""
+     for job_id in job_ids:
+         job_status = client.await_job(job_id, timeout_seconds=job_timeout)
+         if job_status != JobStatus.SUCCEEDED:
+             print(f"Job `{job_id}` logs: ")
+             print(client.get_job_logs(job_id))
+             raise RuntimeError(f"Job `{job_id}` terminated with status: {job_status}")
+         all_job_logs += f"\nJob #{job_number} logs: \n"
+         all_job_logs += client.get_job_logs(job_id)
+         job_number += 1
+     print("All jobs completed!")
+     print("Job Logs: ")
+     print(all_job_logs)
+ 
+ 
+ def run_sync(
+     source: str,
+     dest: str,
+     jobs_to_submit: int,
+     job_timeout: int,
+     cloud: str,
+     restart_ray: bool,
+ ):
+     working_dir = pathlib.Path(__file__).parent
+     cluster_cfg_file_path = working_dir.joinpath(cloud).joinpath("deltacat.yaml")
+     client = job_client(cluster_cfg_file_path, restart_ray=restart_ray)
+     job_number = 0
+     while jobs_to_submit > 0:
+         jobs_to_submit -= 1
+         job_dest = dest + f".{job_number}"
+         job_run_result = client.run_job(
+             # Entrypoint shell command to execute
+             entrypoint=f"python3 indexer.py --source '{source}' --dest '{job_dest}'",
+             # Path to the local directory that contains the indexer.py file.
+             # This entire directory will be zipped into a job package, so keep
+             # it small.
+             runtime_env={"working_dir": working_dir},
+             timeout_seconds=job_timeout,
+         )
+         print(
+             f"Job ID {job_run_result.job_id} terminal state: {job_run_result.job_status}"
+         )
+         print(f"Job ID {job_run_result.job_id} logs: ")
+         print(job_run_result.job_logs)
+         job_number += 1
+ 
+ 
+ def run(
+     source: str,
+     dest: str,
+     restart_ray: bool,
+     jobs_to_submit: int,
+     job_timeout: int,
+     asynchronous: bool,
+     cloud_provider: str,
+ ):
+     run_func = run_async if asynchronous else run_sync
+     run_func(
+         source=source,
+         dest=dest,
+         jobs_to_submit=jobs_to_submit,
+         job_timeout=job_timeout,
+         cloud=cloud_provider,
+         restart_ray=restart_ray,
+     )
+ 
+ 
+ if __name__ == "__main__":
+     """
+     Run this example through a command of the form:
+     $ python ./deltacat/examples/indexer/job_runner.py -- \
+     $   --source text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31 \
+     $   --dest parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet \
+     $   --asynchronous \
+     $   --jobs-to-submit 100 \
+     $   --job-timeout 90 \
+     $   --cloud-provider aws
+     """
+     script_args = [
+         (
+             [
+                 "--source",
+             ],
+             {
+                 "help": "Source DeltaCAT URL to index.",
+                 "type": str,
+                 "default": "text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31",
+             },
+         ),
+         (
+             [
+                 "--dest",
+             ],
+             {
+                 "help": "Destination DeltaCAT URL to store the indexed file.",
+                 "type": str,
+                 "default": "parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet",
+             },
+         ),
+         (
+             [
+                 "--restart-ray",
+             ],
+             {
+                 "help": "Restart Ray on an existing cluster.",
+                 "action": "store_true",
+                 "default": False,
+             },
+         ),
+         (
+             [
+                 "--asynchronous",
+             ],
+             {
+                 "help": "Run jobs asynchronously.",
+                 "action": "store_true",
+                 "default": False,
+             },
+         ),
+         (
+             [
+                 "--jobs-to-submit",
+             ],
+             {
+                 "help": "Number of indexer jobs to submit for execution.",
+                 "type": int,
+                 "default": 1,
+             },
+         ),
+         (
+             [
+                 "--job-timeout",
+             ],
+             {
+                 "help": "Job timeout in seconds.",
+                 "type": int,
+                 "default": 300,
+             },
+         ),
+         (
+             [
+                 "--cloud-provider",
+             ],
+             {
+                 "help": "Ray Cluster Cloud Provider ('aws' or 'gcp')",
+                 "type": str,
+                 "default": "aws",
+             },
+         ),
+     ]
+ 
+     # parse CLI input arguments
+     parser = argparse.ArgumentParser()
+     for args, kwargs in script_args:
+         parser.add_argument(*args, **kwargs)
+     args = parser.parse_args()
+     print(f"Command Line Arguments: {args}")
+ 
+     # run the example using the parsed arguments
+     run(**vars(args))
deltacat/io/__init__.py CHANGED
@@ -0,0 +1,13 @@
+ from deltacat.io.reader.deltacat_read_api import read_deltacat
+ from deltacat.io.datasource.deltacat_datasource import DeltacatReadType
+ from deltacat.io.datasource.deltacat_datasource import (
+     METAFILE_DATA_COLUMN_NAME,
+     METAFILE_TYPE_COLUMN_NAME,
+ )
+ 
+ __all__ = [
+     "read_deltacat",
+     "DeltacatReadType",
+     "METAFILE_DATA_COLUMN_NAME",
+     "METAFILE_TYPE_COLUMN_NAME",
+ ]
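These exports surface DeltaCAT's new Ray Data integration. A minimal, hypothetical sketch of how they might fit together, assuming `read_deltacat` accepts a DeltaCAT URL string like the other URL-driven APIs in this release:

    import deltacat
    from deltacat.io import read_deltacat, METAFILE_DATA_COLUMN_NAME

    deltacat.init()
    # hypothetical catalog URL; the read returns a Ray Dataset whose
    # METAFILE_DATA_COLUMN_NAME column carries DeltaCAT metafile data
    ds = read_deltacat("dc://my-catalog/my-namespace/my-table")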
deltacat/io/dataset/__init__.py ADDED
File without changes
deltacat/io/dataset/deltacat_dataset.py ADDED
@@ -0,0 +1,91 @@
+ # Allow classes to use self-referencing Type hints in Python 3.7.
+ from __future__ import annotations
+ 
+ from typing import Any, Callable, Dict, Optional, cast
+ 
+ import pyarrow as pa
+ from ray.data import Dataset
+ 
+ from deltacat.utils.url import DeltaCatUrl
+ from deltacat.io.datasink.deltacat_datasink import DeltaCatDatasink
+ 
+ 
+ class DeltaCatDataset(Dataset):
+     @staticmethod
+     def from_dataset(dataset: Dataset) -> DeltaCatDataset:
+         # cast to DeltaCatDataset in-place since it only adds new methods
+         dataset.__class__ = DeltaCatDataset
+         return cast(DeltaCatDataset, dataset)
+ 
+     def write_deltacat(
+         self,
+         url: DeltaCatUrl,
+         *,
+         # If the source dataset only contains DeltaCAT metadata, then only
+         # copy the metadata to the destination. If it contains external
+         # source file paths, then register them in a new Delta.
+         metadata_only: bool = False,
+         # merge all deltas as part of the write operation
+         copy_on_write: Optional[bool] = False,
+         filesystem: Optional[pa.fs.S3FileSystem] = None,
+         try_create_dir: bool = True,
+         arrow_open_stream_args: Optional[Dict[str, Any]] = None,
+         arrow_parquet_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
+         min_rows_per_file: Optional[int] = None,
+         ray_remote_args: Dict[str, Any] = None,
+         concurrency: Optional[int] = None,
+         **arrow_parquet_args,
+     ) -> None:
+         """Writes the dataset to files and commits DeltaCAT metadata indexing
+         the files written.
+ 
+         This is only supported for datasets convertible to Arrow records.
+         To control the number of files, use ``.repartition()``.
+ 
+         Unless a custom block path provider is given, the format of the output
+         files will be {uuid}_{block_idx}.{extension}, where ``uuid`` is a
+         unique id for the dataset.
+ 
+         The DeltaCAT manifest will be written to ``f"{path}/manifest"``.
+ 
+         Examples:
+             >>> ds.write_deltacat("s3://catalog/root/path")
+ 
+         Time complexity: O(dataset size / parallelism)
+ 
+         Args:
+             url: The DeltaCAT URL of the root directory where materialized
+                 files and the DeltaCAT manifest will be written.
+             filesystem: The filesystem implementation to write to. This
+                 should be a PyArrow S3FileSystem.
+             try_create_dir: Try to create all directories in the destination
+                 path if True. Does nothing if all directories already exist.
+             arrow_open_stream_args: kwargs passed to
+                 pyarrow.fs.S3FileSystem.open_output_stream.
+             arrow_parquet_args_fn: Callable that returns a dictionary of write
+                 arguments to use when writing each block to a file. Overrides
+                 any duplicate keys from arrow_parquet_args. This should be used
+                 instead of arrow_parquet_args if any of your write arguments
+                 cannot be pickled, or if you'd like to lazily resolve the write
+                 arguments for each dataset block.
+             arrow_parquet_args: Options to pass to
+                 pyarrow.parquet.write_table(), which is used to write out each
+                 block to a file.
+         """
+         datasink = DeltaCatDatasink(
+             url,
+             metadata_only=metadata_only,
+             copy_on_write=copy_on_write,
+             arrow_parquet_args_fn=arrow_parquet_args_fn,
+             arrow_parquet_args=arrow_parquet_args,
+             min_rows_per_file=min_rows_per_file,
+             filesystem=filesystem,
+             try_create_dir=try_create_dir,
+             open_stream_args=arrow_open_stream_args,
+             dataset_uuid=self._uuid,
+         )
+         self.write_datasink(
+             datasink,
+             ray_remote_args=ray_remote_args,
+             concurrency=concurrency,
+         )
deltacat/io/datasink/__init__.py ADDED
File without changes