deltacat 2.0.0b7__py3-none-any.whl → 2.0.0b10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +27 -6
- deltacat/api.py +478 -123
- deltacat/aws/s3u.py +2 -2
- deltacat/benchmarking/conftest.py +1 -1
- deltacat/catalog/main/impl.py +12 -6
- deltacat/catalog/model/catalog.py +65 -47
- deltacat/catalog/model/properties.py +1 -3
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +78 -36
- deltacat/compute/converter/model/convert_input.py +24 -4
- deltacat/compute/converter/model/convert_result.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +52 -10
- deltacat/compute/converter/pyiceberg/overrides.py +181 -62
- deltacat/compute/converter/steps/convert.py +84 -36
- deltacat/compute/converter/steps/dedupe.py +25 -4
- deltacat/compute/converter/utils/convert_task_options.py +42 -13
- deltacat/compute/converter/utils/iceberg_columns.py +5 -0
- deltacat/compute/converter/utils/io.py +82 -11
- deltacat/compute/converter/utils/s3u.py +13 -4
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +404 -0
- deltacat/constants.py +4 -4
- deltacat/daft/daft_scan.py +7 -3
- deltacat/daft/translator.py +126 -0
- deltacat/examples/basic_logging.py +5 -3
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +199 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +580 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +2 -0
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/metafile.py +74 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/types.py +5 -3
- deltacat/storage/rivulet/__init__.py +4 -4
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/compute/converter/test_convert_session.py +209 -46
- deltacat/tests/local_deltacat_storage/__init__.py +1 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +2 -1
- deltacat/tests/storage/rivulet/test_dataset.py +1 -1
- deltacat/tests/storage/rivulet/test_manifest.py +1 -1
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +1 -1
- deltacat/tests/test_deltacat_api.py +50 -9
- deltacat/types/media.py +141 -43
- deltacat/types/tables.py +35 -7
- deltacat/utils/daft.py +2 -2
- deltacat/utils/filesystem.py +39 -9
- deltacat/utils/polars.py +128 -0
- deltacat/utils/pyarrow.py +151 -15
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/url.py +1284 -0
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/METADATA +9 -6
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/RECORD +73 -48
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/LICENSE +0 -0
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/top_level.txt +0 -0
deltacat/daft/daft_scan.py
CHANGED
@@ -9,10 +9,11 @@ from daft.daft import (
     FileFormatConfig,
     ParquetSourceConfig,
 )
-from daft.io.scan import ScanOperator
+from daft.io.scan import ScanOperator, ScanPushdowns

 from deltacat.catalog.model.table_definition import TableDefinition
 from deltacat.daft.model import DaftPartitionKeyMapper
+from deltacat.daft.translator import translate_pushdown


 class DeltaCatScanOperator(ScanOperator):
@@ -44,8 +45,11 @@ class DeltaCatScanOperator(ScanOperator):
     ]

     def to_scan_tasks(self, pushdowns: Pushdowns) -> Iterator[ScanTask]:
-
-
+        daft_pushdowns = ScanPushdowns._from_pypushdowns(
+            pushdowns, schema=self.schema()
+        )
+        dc_pushdown = translate_pushdown(daft_pushdowns)
+        dc_scan_plan = self.table.create_scan_plan(pushdown=dc_pushdown)
         scan_tasks = []
         file_format_config = FileFormatConfig.from_parquet_config(
             # maybe this: ParquetSourceConfig(field_id_mapping=self._field_id_mapping)
deltacat/daft/translator.py
ADDED
@@ -0,0 +1,126 @@
+from daft.io.scan import ScanPushdowns
+import pyarrow as pa
+from typing import Callable, Dict
+from daft.io.pushdowns import (
+    Expr as DaftExpr,
+    Literal as DaftLiteral,
+    Reference as DaftReference,
+    TermVisitor,
+)
+
+from deltacat.storage.model.expression import (
+    Expression,
+    Reference,
+    Literal,
+    Equal,
+    NotEqual,
+    GreaterThan,
+    LessThan,
+    GreaterThanEqual,
+    LessThanEqual,
+    And,
+    Or,
+    Not,
+    IsNull,
+)
+from deltacat.storage.model.scan.push_down import PartitionFilter, Pushdown
+
+
+def translate_pushdown(pushdown: ScanPushdowns) -> Pushdown:
+    """
+    Helper method to translate a Daft ScanPushdowns object into a Deltacat Pushdown.
+
+    Args:
+        pushdown: Daft ScanPushdowns object
+
+    Returns:
+        Pushdown: Deltacat Pushdown object with translated filters
+    """
+    translator = DaftToDeltacatExpressionTranslator()
+    partition_filter = None
+
+    if pushdown.predicate:
+        predicate = translator.visit(pushdown.predicate, None)
+        partition_filter = PartitionFilter.of(predicate)
+
+    # TODO: translate other pushdown filters
+    return Pushdown.of(
+        row_filter=None,
+        column_filter=None,
+        partition_filter=partition_filter,
+        limit=None,
+    )
+
+
+class DaftToDeltacatExpressionTranslator(TermVisitor[None, Expression]):
+    """
+    This visitor implementation traverses a Daft expression tree and produces
+    an equivalent Deltacat expression tree for use in Deltacat's query pushdown
+    system.
+    """
+
+    _PROCEDURES: Dict[str, Callable[..., Expression]] = {
+        # Comparison predicates
+        "=": Equal.of,
+        "!=": NotEqual.of,
+        "<": LessThan.of,
+        ">": GreaterThan.of,
+        "<=": LessThanEqual.of,
+        ">=": GreaterThanEqual.of,
+        # Logical predicates
+        "and": And.of,
+        "or": Or.of,
+        "not": Not.of,
+        # Special operations
+        "is_null": IsNull.of,
+    }
+
+    def visit_reference(self, term: DaftReference, context: None) -> Expression:
+        """
+        Convert Daft Reference to Deltacat Reference.
+
+        Args:
+            term: A Daft Reference expression representing a field or column.
+            context: Not used in this visitor implementation.
+
+        Returns:
+            DeltacatExpression: A Deltacat Reference expression for the same field.
+        """
+        return Reference(term.path)
+
+    def visit_literal(self, term: DaftLiteral, context: None) -> Expression:
+        """
+        Convert Daft Literal to Deltacat Literal.
+
+        Args:
+            term: A Daft Literal expression representing a constant value.
+            context: Not used in this visitor implementation.
+
+        Returns:
+            DeltacatExpression: A Deltacat Literal expression wrapping the same value as a PyArrow scalar.
+        """
+        return Literal(pa.scalar(term.value))
+
+    def visit_expr(self, term: DaftExpr, context: None) -> Expression:
+        """
+        This method handles the translation of procedure calls (operations) from
+        Daft to Deltacat, including special cases for IN, BETWEEN, and LIKE.
+
+        Args:
+            term: A Daft Expr expression representing an operation.
+            context: Not used in this visitor implementation.
+
+        Returns:
+            DeltacatExpression: An equivalent Deltacat expression.
+
+        Raises:
+            ValueError: If the operation has an invalid number of arguments or
+                if the operation is not supported by Deltacat.
+        """
+        proc = term.proc
+        args = [self.visit(arg.term, context) for arg in term.args]
+
+        if proc not in self._PROCEDURES:
+            raise ValueError(f"Deltacat does not support procedure '{proc}'.")
+
+        return self._PROCEDURES[proc](*args)
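For orientation, here is a minimal sketch of the DeltaCAT expression tree this translator produces for a Daft predicate equivalent to `part_key = 5 AND NOT is_null(other_col)`. It uses only the expression factories that `_PROCEDURES` maps to above; the column names are hypothetical.

import pyarrow as pa

from deltacat.storage.model.expression import (
    And, Equal, IsNull, Literal, Not, Reference,
)
from deltacat.storage.model.scan.push_down import PartitionFilter, Pushdown

# The visitor maps "=", "not", and "is_null" onto these factories, converting
# references via Reference(path) and literals via Literal(pa.scalar(value)).
predicate = And.of(
    Equal.of(Reference("part_key"), Literal(pa.scalar(5))),  # hypothetical column
    Not.of(IsNull.of(Reference("other_col"))),               # hypothetical column
)

# translate_pushdown then wraps the translated predicate in a PartitionFilter,
# leaving the remaining pushdown fields unset for now.
pushdown = Pushdown.of(
    row_filter=None,
    column_filter=None,
    partition_filter=PartitionFilter.of(predicate),
    limit=None,
)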
deltacat/examples/basic_logging.py
CHANGED
@@ -1,7 +1,9 @@
 import os
-import
+import deltacat
 import logging

+import ray
+
 from deltacat import logs
 from deltacat.constants import DELTACAT_APP_LOG_DIR, DELTACAT_SYS_LOG_DIR
 from deltacat.examples.common.fixtures import (
@@ -94,8 +96,8 @@ if __name__ == "__main__":
     # create any runtime environment required to run the example
     runtime_env = create_ray_runtime_environment()

-    # initialize
-
+    # initialize deltacat
+    deltacat.init(ray_init_args={"runtime_env": runtime_env})

     # run the example using os.environ as kwargs
     run(**os.environ)
deltacat/examples/hello_world.py
CHANGED
@@ -1,12 +1,10 @@
 import ray
 import deltacat
 import daft
-import pyiceberg


 def print_package_version_info():
     print(f"DeltaCAT Version: {deltacat.__version__}")
-    print(f"PyIceberg Version: {pyiceberg.__version__}")
     print(f"Ray Version: {ray.__version__}")
     print(f"Daft Version: {daft.__version__}")

@@ -24,4 +22,8 @@ def run():


 if __name__ == "__main__":
+    # initialize deltacat
+    deltacat.init()
+
+    # run the example
     run()
deltacat/examples/indexer/__init__.py
File without changes
deltacat/examples/indexer/aws/__init__.py
File without changes
deltacat/examples/indexer/gcp/__init__.py
File without changes
deltacat/examples/indexer/indexer.py
ADDED
@@ -0,0 +1,163 @@
+import argparse
+
+from datetime import datetime
+
+import ray
+
+import deltacat
+import daft
+import pyarrow as pa
+import pandas as pd
+import polars as pl
+import numpy as np
+
+from deltacat import DeltaCatUrl
+
+
+def print_package_version_info() -> None:
+    print(f"DeltaCAT Version: {deltacat.__version__}")
+    print(f"Ray Version: {ray.__version__}")
+    print(f"Daft Version: {daft.__version__}")
+    print(f"NumPy Version: {np.__version__}")
+    print(f"PyArrow Version: {pa.__version__}")
+    print(f"Polars Version: {pl.__version__}")
+    print(f"Pandas Version: {pd.__version__}")
+
+
+def json_path_to_regex(path: str):
+    if not path:
+        raise ValueError("Path cannot be empty")
+    parts = path.split("/")
+    leaf_key = parts.pop()
+    regex = r""
+    for part in parts:
+        if part.strip():  # discard leading and/or redundant separators
+            regex += rf'"{part}"\s*:\s*[{{\[].*?'
+    regex += rf'"{leaf_key}"\s*:\s*"(?<{leaf_key}>.*?)"'
+    return regex
+
+
+def run(
+    source: str,
+    dest: str,
+) -> None:
+    # print package version info
+    print_package_version_info()
+
+    # run a synchronous copy from the source to the destination
+    deltacat.copy(
+        DeltaCatUrl(source),
+        DeltaCatUrl(dest),
+        # reader arguments to pass to the default reader (polars)
+        # for the given text-based datasource, it accepts the same
+        # arguments as polars.read_csv except for `source`, `n_threads`
+        # `new_columns`, `separator`, `has_header`, `quote_char`, and
+        # `infer_schema`.
+        reader_args={
+            "low_memory": True,  # try to use less memory (++stability, --perf)
+            "batch_size": 1024,  # text line count read into a buffer at once
+            "use_pyarrow": True,  # use the native pyarrow reader
+        },
+        # writer arguments to pass to the default writer (polars)
+        # for the given parquet-based datasink, it accepts the same
+        # arguments as polars.DataFrame.write_parquet except for `file`
+        writer_args={
+            "compression": "lz4",  # faster compression & decompression
+            # "compression": "zstd",  # better compression ratio
+            # "compression": "snappy",  # compatible w/ older Parquet readers
+        },
+        # Transforms to run against the default polars dataframe read.
+        # By default, each transform takes a polars dataframe `df` as input
+        # and produces a polars dataframe as output. All transforms listed
+        # are run in order (i.e., the dataframe output from transform[0]
+        # is the dataframe input to transform[1]).
+        #
+        # See:
+        # https://docs.pola.rs/api/python/stable/reference/dataframe/index.html
+        # https://docs.pola.rs/api/python/stable/reference/expressions/index.html
+        transforms=[
+            lambda df, src: df.rename(
+                {"text": "utf8_body"},
+            ),
+            lambda df, src: df.with_columns(
+                pl.col("utf8_body").hash().alias("utf8_body_hash"),
+                pl.lit(datetime.utcnow()).dt.datetime().alias("processing_time"),
+                pl.lit(src.url_path).alias("source_file_path"),
+            ),
+        ],
+    )
+
+
+if __name__ == "__main__":
+    """
+    Example 1: Run this script locally using Ray:
+    $ python indexer.py \
+    $   --source 'text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31' \
+    $   --dest 'parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet'
+
+    Example 2: Submit this script as a local Ray job using a local job client:
+    >>> from deltacat import local_job_client
+    >>> client = local_job_client()
+    >>> # read the source file as line-delimited text
+    >>> src = "text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31"
+    >>> # write to the destination file using the default DeltaCAT Parquet writer (i.e., polars.DataFrame.write_parquet)
+    >>> dst = "parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet"
+    >>> try:
+    >>>     job_run_result = client.run_job(
+    >>>         # Entrypoint shell command to run the indexer job
+    >>>         entrypoint=f"python indexer.py --source '{src}' --dest '{dst}'",
+    >>>         # Path to the local directory that contains the indexer.py file
+    >>>         runtime_env={"working_dir": "./deltacat/examples/indexer.py"},
+    >>>     )
+    >>>     print(f"Job ID {job_run_result.job_id} terminal state: {job_run_result.job_status}")
+    >>>     print(f"Job ID {job_run_result.job_id} logs: ")
+    >>>     print(job_run_result.job_logs)
+    >>> except RuntimeError as e:
+    >>>     print(f"Job Run Failed: {e}")
+    >>> except TimeoutError as e:
+    >>>     print(f"Job Run Timed Out: {e}")

+    Example 3: Submit this script as a remote Ray job using a remote job client:
+    >>> from deltacat import job_client
+    >>> # use `deltacat.yaml` from the current working directory as the ray cluster launcher config file
+    >>> # automatically launches the cluster if it doesn't exist or has died
+    >>> # automatically forwards the ray cluster's dashboard for viewing in a web browser @ http://localhost:8265
+    >>> client = job_client()
+    >>> # ... follow the same steps as above to submit a synchronous indexer job ...
+    >>>
+    >>> # OR use an explicit cluster launcher config file path
+    >>> client = job_client("/Users/pdames/workspace/deltacat.yaml")
+    >>> # ... follow the same steps as above to submit a synchronous indexer job ...
+    """
+    script_args = [
+        (
+            [
+                "--source",
+            ],
+            {
+                "help": "Source DeltaCAT URL to index.",
+                "type": str,
+            },
+        ),
+        (
+            [
+                "--dest",
+            ],
+            {
+                "help": "Destination DeltaCAT URL to index.",
+                "type": str,
+            },
+        ),
+    ]
+    # parse CLI input arguments
+    parser = argparse.ArgumentParser()
+    for args, kwargs in script_args:
+        parser.add_argument(*args, **kwargs)
+    args = parser.parse_args()
+    print(f"Command Line Arguments: {args}")
+
+    # initialize deltacat
+    deltacat.init()
+
+    # run the example using the parsed arguments
+    run(**vars(args))
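As a quick check on the `json_path_to_regex` helper above, a worked example with a hypothetical two-level path; note that the `(?<name>...)` named group is PCRE-style syntax rather than Python `re`'s `(?P<name>...)`.

from deltacat.examples.indexer.indexer import json_path_to_regex

# Hypothetical path: capture the string value of "value" nested under "abstract".
print(json_path_to_regex("abstract/value"))
# -> "abstract"\s*:\s*[{\[].*?"value"\s*:\s*"(?<value>.*?)"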
deltacat/examples/indexer/job_runner.py
ADDED
@@ -0,0 +1,199 @@
+import argparse
+import pathlib
+
+from deltacat.compute import (
+    job_client,
+    JobStatus,
+)
+
+
+def run_async(
+    source: str,
+    dest: str,
+    jobs_to_submit: int,
+    job_timeout: int,
+    cloud: str,
+    restart_ray: bool,
+):
+    # print package version info
+    working_dir = pathlib.Path(__file__).parent
+    cluster_cfg_file_path = working_dir.joinpath(cloud).joinpath("deltacat.yaml")
+    job_number = 0
+    client = job_client(cluster_cfg_file_path, restart_ray=restart_ray)
+    job_ids = []
+    while jobs_to_submit > 0:
+        jobs_to_submit -= 1
+        job_dest = dest + f".{job_number}"
+        job_id = client.submit_job(
+            # Entrypoint shell command to execute
+            entrypoint=f"python3 indexer.py --source '{source}' --dest '{job_dest}'",
+            # Path to the local directory that contains the indexer.py file
+            # This entire directory will be zipped into a job package, so keep
+            # it small.
+            runtime_env={"working_dir": working_dir},
+        )
+        job_ids.append(job_id)
+        job_number += 1
+
+    print("Waiting for all jobs to complete...")
+    job_number = 0
+    all_job_logs = ""
+    for job_id in job_ids:
+        job_status = client.await_job(job_id, timeout_seconds=job_timeout)
+        if job_status != JobStatus.SUCCEEDED:
+            print(f"Job `{job_id}` logs: ")
+            print(client.get_job_logs(job_id))
+            raise RuntimeError(f"Job `{job_id}` terminated with status: {job_status}")
+        all_job_logs += f"\nJob #{job_number} logs: \n"
+        all_job_logs += client.get_job_logs(job_id)
+        job_number += 1
+    print("All jobs completed!")
+    print("Job Logs: ")
+    print(all_job_logs)
+
+
+def run_sync(
+    source: str,
+    dest: str,
+    jobs_to_submit: int,
+    job_timeout: int,
+    cloud: str,
+    restart_ray: bool,
+):
+    working_dir = pathlib.Path(__file__).parent
+    cluster_cfg_file_path = working_dir.joinpath(cloud).joinpath("deltacat.yaml")
+    client = job_client(cluster_cfg_file_path, restart_ray=restart_ray)
+    job_number = 0
+    while jobs_to_submit > 0:
+        jobs_to_submit -= 1
+        job_dest = dest + f".{job_number}"
+        job_run_result = client.run_job(
+            # Entrypoint shell command to execute
+            entrypoint=f"python3 indexer.py --source '{source}' --dest '{job_dest}'",
+            # Path to the local directory that contains the indexer.py file
+            # This entire directory will be zipped into a job package, so keep
+            # it small.
+            runtime_env={"working_dir": working_dir},
+            timeout_seconds=job_timeout,
+        )
+        print(
+            f"Job ID {job_run_result.job_id} terminal state: {job_run_result.job_status}"
+        )
+        print(f"Job ID {job_run_result.job_id} logs: ")
+        print(job_run_result.job_logs)
+        job_number += 1
+
+
+def run(
+    source: str,
+    dest: str,
+    restart_ray: bool,
+    jobs_to_submit: int,
+    job_timeout: int,
+    asynchronous: bool,
+    cloud_provider: str,
+):
+    run_func = run_async if asynchronous else run_sync
+    run_func(
+        source=source,
+        dest=dest,
+        jobs_to_submit=jobs_to_submit,
+        job_timeout=job_timeout,
+        cloud=cloud_provider,
+        restart_ray=restart_ray,
+    )
+
+
+if __name__ == "__main__":
+    """
+    # Run this example through a command of the form:
+    $ python ./deltacat/examples/job_runner.py -- \
+    $   --source text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31 \
+    $   --dest parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet \
+    $   --asynchronous \
+    $   --jobs-to-submit 100 \
+    $   --job-timeout 90 \
+    $   --cloud-provider aws
+    """
+    script_args = [
+        (
+            [
+                "--source",
+            ],
+            {
+                "help": "Source DeltaCAT URL to index.",
+                "type": str,
+                "default": "text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31",
+            },
+        ),
+        (
+            [
+                "--dest",
+            ],
+            {
+                "help": "Destination DeltaCAT URL to store the indexed file.",
+                "type": str,
+                "default": "parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet",
+            },
+        ),
+        (
+            [
+                "--restart-ray",
+            ],
+            {
+                "help": "Restart Ray on an existing cluster.",
+                "action": "store_true",
+                "default": False,
+            },
+        ),
+        (
+            [
+                "--asynchronous",
+            ],
+            {
+                "help": "Run jobs asynchronously.",
+                "action": "store_true",
+                "default": False,
+            },
+        ),
+        (
+            [
+                "--jobs-to-submit",
+            ],
+            {
+                "help": "Number of indexer jobs to submit for execution.",
+                "type": int,
+                "default": 1,
+            },
+        ),
+        (
+            [
+                "--job-timeout",
+            ],
+            {
+                "help": "Job timeout in seconds.",
+                "type": int,
+                "default": 300,
+            },
+        ),
+        (
+            [
+                "--cloud-provider",
+            ],
+            {
+                "help": "Ray Cluster Cloud Provider ('aws' or 'gcp')",
+                "type": str,
+                "default": "aws",
+            },
+        ),
+    ]
+
+    # parse CLI input arguments
+    parser = argparse.ArgumentParser()
+    for args, kwargs in script_args:
+        parser.add_argument(*args, **kwargs)
+    args = parser.parse_args()
+    print(f"Command Line Arguments: {args}")
+
+    # run the example using os.environ as kwargs
+    run(**vars(args))
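Condensed from the runner above, the asynchronous submit/await pattern looks like the sketch below. The cluster config path and entrypoint URLs are hypothetical, while `job_client`, `submit_job`, `await_job`, `get_job_logs`, and `JobStatus` all appear in this diff.

from deltacat.compute import job_client, JobStatus

# Hypothetical cluster launcher config path; restart_ray=False reuses a
# running Ray cluster instead of restarting Ray on it.
client = job_client("./aws/deltacat.yaml", restart_ray=False)

# Submit without blocking, then wait for the job to reach a terminal state.
job_id = client.submit_job(
    entrypoint="python3 indexer.py --source 'text+s3://bucket/in' --dest 'parquet+s3://bucket/out'",
    runtime_env={"working_dir": "."},  # zipped into the job package
)
status = client.await_job(job_id, timeout_seconds=300)
if status != JobStatus.SUCCEEDED:
    print(client.get_job_logs(job_id))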
deltacat/io/__init__.py
CHANGED
@@ -0,0 +1,13 @@
+from deltacat.io.reader.deltacat_read_api import read_deltacat
+from deltacat.io.datasource.deltacat_datasource import DeltacatReadType
+from deltacat.io.datasource.deltacat_datasource import (
+    METAFILE_DATA_COLUMN_NAME,
+    METAFILE_TYPE_COLUMN_NAME,
+)
+
+__all__ = [
+    "read_deltacat",
+    "DeltacatReadType",
+    "METAFILE_DATA_COLUMN_NAME",
+    "METAFILE_TYPE_COLUMN_NAME",
+]
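A hedged usage sketch of the new `read_deltacat` entry point; its exact signature is not shown in this diff, so the call shape and catalog URL below are assumptions.

import deltacat
from deltacat.io import read_deltacat

deltacat.init()

# Assumed call shape and hypothetical DeltaCAT URL; the reader is expected to
# return a Ray Dataset whose rows can include the metafile data/type columns
# exported above.
ds = read_deltacat("dc://my_catalog/my_namespace/my_table")
ds.show(limit=3)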
deltacat/io/dataset/__init__.py
File without changes
deltacat/io/dataset/deltacat_dataset.py
ADDED
@@ -0,0 +1,91 @@
+# Allow classes to use self-referencing Type hints in Python 3.7.
+from __future__ import annotations
+
+from typing import Any, Callable, Dict, Optional, cast
+
+import pyarrow as pa
+from ray.data import Dataset
+
+from deltacat.utils.url import DeltaCatUrl
+from deltacat.io.datasink.deltacat_datasink import DeltaCatDatasink
+
+
+class DeltaCatDataset(Dataset):
+    @staticmethod
+    def from_dataset(dataset: Dataset) -> DeltaCatDataset:
+        # cast to DeltacatDataset in-place since it only adds new methods
+        dataset.__class__ = DeltaCatDataset
+        return cast(DeltaCatDataset, dataset)
+
+    def write_deltacat(
+        self,
+        url: DeltaCatUrl,
+        *,
+        # if the source dataset only contains DeltaCAT metadata, then only copy the metadata to the destination... if it contains external source file paths, then register them in a new Delta.
+        metadata_only: bool = False,
+        # merge all deltas as part of the write operation
+        copy_on_write: Optional[bool] = False,
+        filesystem: Optional[pa.fs.S3FileSystem] = None,
+        try_create_dir: bool = True,
+        arrow_open_stream_args: Optional[Dict[str, Any]] = None,
+        arrow_parquet_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
+        min_rows_per_file: Optional[int] = None,
+        ray_remote_args: Dict[str, Any] = None,
+        concurrency: Optional[int] = None,
+        **arrow_parquet_args,
+    ) -> None:
+        """Writes the dataset to files and commits DeltaCAT metadata indexing
+        the files written.
+
+        This is only supported for datasets convertible to Arrow records.
+        To control the number of files, use ``.repartition()``.
+
+        Unless a custom block path provider is given, the format of the output
+        files will be {uuid}_{block_idx}.{extension}, where ``uuid`` is a
+        unique id for the dataset.
+
+        The DeltaCAT manifest will be written to ``f"{path}/manifest"``.
+
+        Examples:
+            >>> ds.write_deltacat("s3://catalog/root/path")
+
+        Time complexity: O(dataset size / parallelism)
+
+        Args:
+            url: The path to the root directory where materialized files and
+                DeltaCAT manifest will be written.
+            filesystem: The filesystem implementation to write to. This should
+                be a PyArrow S3FileSystem.
+            try_create_dir: Try to create all directories in destination path
+                if True. Does nothing if all directories already exist.
+            arrow_open_stream_args: kwargs passed to
+                pyarrow.fs.S3FileSystem.open_output_stream
+            filename_provider: FilenameProvider implementation
+                to write each dataset block to a custom output path.
+            arrow_parquet_args_fn: Callable that returns a dictionary of write
+                arguments to use when writing each block to a file. Overrides
+                any duplicate keys from arrow_parquet_args. This should be used
+                instead of arrow_parquet_args if any of your write arguments
+                cannot be pickled, or if you'd like to lazily resolve the write
+                arguments for each dataset block.
+            arrow_parquet_args: Options to pass to
+                pyarrow.parquet.write_table(), which is used to write out each
+                block to a file.
+        """
+        datasink = DeltaCatDatasink(
+            url,
+            metadata_only=metadata_only,
+            copy_on_write=copy_on_write,
+            arrow_parquet_args_fn=arrow_parquet_args_fn,
+            arrow_parquet_args=arrow_parquet_args,
+            min_rows_per_file=min_rows_per_file,
+            filesystem=filesystem,
+            try_create_dir=try_create_dir,
+            open_stream_args=arrow_open_stream_args,
+            dataset_uuid=self._uuid,
+        )
+        self.write_datasink(
+            datasink,
+            ray_remote_args=ray_remote_args,
+            concurrency=concurrency,
+        )
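A short usage sketch of the class above; `from_dataset` and `write_deltacat` come straight from this diff, while the destination URL is hypothetical.

import ray
from deltacat.utils.url import DeltaCatUrl
from deltacat.io.dataset.deltacat_dataset import DeltaCatDataset

# Wrap an ordinary Ray Dataset in place to gain the write_deltacat() method.
ds = ray.data.range(100)
dc_ds = DeltaCatDataset.from_dataset(ds)

# Hypothetical destination URL: writes Parquet files and commits DeltaCAT
# metadata indexing them.
dc_ds.write_deltacat(DeltaCatUrl("parquet+s3://my-bucket/my-table"))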
deltacat/io/datasink/__init__.py
File without changes