datapipe-core 0.14.2.dev2__tar.gz → 0.15.0.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/.gitignore +0 -1
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/PKG-INFO +1 -3
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/database.py +10 -18
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/filedir.py +8 -17
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/redis.py +4 -20
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/table_store.py +0 -20
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/pyproject.toml +1 -5
- datapipe_core-0.14.2.dev2/datapipe/store/elastic.py +0 -211
- datapipe_core-0.14.2.dev2/datapipe/store/tests/__init__.py +0 -0
- datapipe_core-0.14.2.dev2/datapipe/store/tests/abstract.py +0 -307
- datapipe_core-0.14.2.dev2/datapipe/store/tests/stubs.py +0 -89
- datapipe_core-0.14.2.dev2/datapipe/tests/__init__.py +0 -0
- datapipe_core-0.14.2.dev2/datapipe/tests/util.py +0 -59
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/LICENSE +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/README.md +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/__init__.py +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/cli.py +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/compute.py +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/datatable.py +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/event_logger.py +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/executor/__init__.py +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/executor/ray.py +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/lints.py +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/meta/__init__.py +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/meta/sql_meta.py +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/migrations/__init__.py +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/migrations/v013.py +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/py.typed +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/run_config.py +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/sql_util.py +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/step/__init__.py +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/step/batch_generate.py +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/step/batch_transform.py +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/step/datatable_transform.py +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/step/update_external_table.py +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/__init__.py +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/milvus.py +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/pandas.py +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/qdrant.py +0 -0
- {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/types.py +0 -0
{datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datapipe-core
-Version: 0.14.2.dev2
+Version: 0.15.0.dev1
 Summary: `datapipe` is a realtime incremental ETL library for Python application
 Project-URL: Repository, https://github.com/epoch8/datapipe
 Author-email: Andrey Tatarinov <a@tatarinov.co>

@@ -24,8 +24,6 @@ Requires-Dist: sqlalchemy<3.0.0,>=2.0.0
 Requires-Dist: tqdm-loggable<0.3,>=0.2
 Requires-Dist: traceback-with-variables<3,>=2.0.4
 Requires-Dist: types-tqdm>=4.67.0.20241221
-Provides-Extra: elastic
-Requires-Dist: elasticsearch>=8.17.1; extra == 'elastic'
 Provides-Extra: excel
 Requires-Dist: openpyxl>=3.0.7; extra == 'excel'
 Requires-Dist: xlrd>=2.0.1; extra == 'excel'
{datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/database.py

@@ -14,7 +14,7 @@ from sqlalchemy.sql.expression import delete, select
 
 from datapipe.run_config import RunConfig
 from datapipe.sql_util import sql_apply_idx_filter_to_table, sql_apply_runconfig_filter
-from datapipe.store.table_store import TableStore, TableStoreCaps
+from datapipe.store.table_store import TableStore
 from datapipe.types import DataDF, DataSchema, IndexDF, MetaSchema, OrmTable, TAnyDF
 
 logger = logging.getLogger("datapipe.store.database")

@@ -121,14 +121,6 @@ class MetaKey(SchemaItem):
 
 
 class TableStoreDB(TableStore):
-    caps = TableStoreCaps(
-        supports_delete=True,
-        supports_get_schema=True,
-        supports_read_all_rows=True,
-        supports_read_nonexistent_rows=True,
-        supports_read_meta_pseudo_df=True,
-    )
-
     def __init__(
         self,
         dbconn: Union["DBConn", str],

@@ -144,9 +136,9 @@
 
         if orm_table is not None:
             assert name is None, "name should be None if orm_table is provided"
-            assert data_sql_schema is None, (
-                "data_sql_schema should be None if orm_table is provided"
-            )
+            assert (
+                data_sql_schema is None
+            ), "data_sql_schema should be None if orm_table is provided"
 
             orm_table__table = orm_table.__table__  # type: ignore
             self.data_table = cast(Table, orm_table__table)

@@ -169,12 +161,12 @@
             ]
 
         else:
-            assert name is not None, (
-                "name should be provided if data_table is not provided"
-            )
-            assert data_sql_schema is not None, (
-                "data_sql_schema should be provided if data_table is not provided"
-            )
+            assert (
+                name is not None
+            ), "name should be provided if data_table is not provided"
+            assert (
+                data_sql_schema is not None
+            ), "data_sql_schema should be provided if data_table is not provided"
 
         self.name = name
 
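The reworked assertions encode the constructor's contract: a `TableStoreDB` is configured either from an ORM-mapped table or from an explicit name plus SQLAlchemy column schema, never both. A minimal construction sketch (the connection string, table name and columns are made-up examples; only the keyword names visible in this diff are assumed):

    from sqlalchemy import Column, Integer, String

    from datapipe.store.database import TableStoreDB

    # Variant 1: explicit name + data_sql_schema (orm_table stays None)
    store = TableStoreDB(
        dbconn="sqlite:///example.sqlite",  # a DBConn object or a plain connection string
        name="products",                    # hypothetical table name
        data_sql_schema=[
            Column("id", Integer, primary_key=True),
            Column("title", String(100)),
        ],
    )

    # Variant 2: an ORM-mapped class; name and data_sql_schema must then be None,
    # because the table and columns are taken from orm_table.__table__.
    # store = TableStoreDB(dbconn="sqlite:///example.sqlite", orm_table=Product)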
{datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/filedir.py

@@ -15,7 +15,7 @@ from PIL import Image
 from sqlalchemy import Column, Integer, String
 
 from datapipe.run_config import RunConfig
-from datapipe.store.table_store import TableStore, TableStoreCaps
+from datapipe.store.table_store import TableStore
 from datapipe.types import DataDF, DataSchema, IndexDF, MetaSchema
 
 

@@ -103,9 +103,9 @@ def _pattern_to_attrnames(pat: str) -> List[str]:
     assert len(attrnames) > 0, "The scheme is not valid."
     if len(attrnames) >= 2:
         duplicates_attrnames = list(duplicates(attrnames))
-        assert len(duplicates_attrnames) == 0, (
-            f"Some keys are repeated: {duplicates_attrnames}. Rename them."
-        )
+        assert (
+            len(duplicates_attrnames) == 0
+        ), f"Some keys are repeated: {duplicates_attrnames}. Rename them."
 
     return attrnames
 

@@ -161,14 +161,6 @@ class Replacer:
 
 
 class TableStoreFiledir(TableStore):
-    caps = TableStoreCaps(
-        supports_delete=True,
-        supports_get_schema=False,
-        supports_read_all_rows=True,
-        supports_read_nonexistent_rows=False,
-        supports_read_meta_pseudo_df=True,
-    )
-
     def __init__(
         self,
         filename_pattern: Union[str, Path],

@@ -286,8 +278,7 @@ class TableStoreFiledir(TableStore):
             for attrname in self.attrnames
         ]
         self.attrname_to_cls = {
-            column.name: type_to_cls[type(column.type)]
-            for column in self.primary_schema  # type: ignore
+            column.name: type_to_cls[type(column.type)] for column in self.primary_schema  # type: ignore
         }
 
     def get_primary_schema(self) -> DataSchema:

@@ -331,9 +322,9 @@
         """
         _, filepath = fsspec.core.split_protocol(filepath)
         m = re.match(self.filename_match, filepath)
-        assert m is not None, (
-            f"Filepath {filepath} does not match the pattern {self.filename_match}"
-        )
+        assert (
+            m is not None
+        ), f"Filepath {filepath} does not match the pattern {self.filename_match}"
 
         data = {}
         for attrname in self.attrnames:
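For context on the reworked assertion in `_pattern_to_attrnames`: the function pulls the `{...}` attribute names out of a `filename_pattern` and refuses patterns that repeat a key. The helper below is an illustrative stand-in, not the library's implementation, showing the behaviour the assertion guards:

    import re
    from typing import List

    def pattern_to_attrnames_sketch(pat: str) -> List[str]:
        # Collect "{name}" placeholders from a filedir pattern.
        attrnames = re.findall(r"\{(\w+)\}", pat)
        assert len(attrnames) > 0, "The scheme is not valid."
        repeated = sorted({a for a in attrnames if attrnames.count(a) > 1})
        assert len(repeated) == 0, f"Some keys are repeated: {repeated}. Rename them."
        return attrnames

    print(pattern_to_attrnames_sketch("images/{category}/{id}.png"))  # ['category', 'id']
    # pattern_to_attrnames_sketch("images/{id}/{id}.png") raises: the key 'id' is repeated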
{datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/redis.py

@@ -7,7 +7,7 @@ from redis.cluster import RedisCluster
 from sqlalchemy import Column
 
 from datapipe.store.database import MetaKey
-from datapipe.store.table_store import TableStore, TableStoreCaps
+from datapipe.store.table_store import TableStore
 from datapipe.types import DataDF, DataSchema, IndexDF, MetaSchema, data_to_index
 
 

@@ -24,30 +24,14 @@ def _to_itertuples(df: DataDF, colnames):
 
 
 class RedisStore(TableStore):
-    caps = TableStoreCaps(
-        supports_delete=True,
-        supports_get_schema=False,
-        supports_read_all_rows=False,
-        supports_read_nonexistent_rows=False,  # TODO check
-        supports_read_meta_pseudo_df=False,
-    )
-
     def __init__(
-        self,
-        connection: str,
-        name: str,
-        data_sql_schema: List[Column],
-        cluster_mode: bool = False,
+        self, connection: str, name: str, data_sql_schema: List[Column], cluster_mode: bool = False
     ) -> None:
         self.connection = connection
         if not cluster_mode:
-            self.redis_connection: Union[Redis, RedisCluster] = Redis.from_url(
-                connection, decode_responses=True
-            )
+            self.redis_connection: Union[Redis, RedisCluster] = Redis.from_url(connection, decode_responses=True)
         else:
-            self.redis_connection = RedisCluster.from_url(
-                connection, decode_responses=True
-            )
+            self.redis_connection = RedisCluster.from_url(connection, decode_responses=True)
 
         self.name = name
         self.data_sql_schema = data_sql_schema
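`RedisStore` keeps the same constructor parameters; only the formatting of the signature and of the `from_url` calls changes. A hedged usage sketch (URL and schema are examples, not taken from the package):

    from sqlalchemy import Column, Integer, String

    from datapipe.store.redis import RedisStore

    store = RedisStore(
        connection="redis://localhost:6379/0",  # example URL; set cluster_mode=True to use RedisCluster.from_url
        name="products",                        # hypothetical store name
        data_sql_schema=[
            Column("id", String(100), primary_key=True),
            Column("price", Integer),
        ],
        cluster_mode=False,
    )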
{datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/table_store.py

@@ -1,5 +1,4 @@
 from abc import ABC
-from dataclasses import dataclass
 from pathlib import Path
 from typing import Iterator, List, Optional, Union
 

@@ -10,18 +9,7 @@ from datapipe.run_config import RunConfig
 from datapipe.types import DataDF, DataSchema, IndexDF, MetaSchema, data_to_index
 
 
-@dataclass
-class TableStoreCaps:
-    supports_delete: bool
-    supports_get_schema: bool
-    supports_read_all_rows: bool
-    supports_read_nonexistent_rows: bool
-    supports_read_meta_pseudo_df: bool
-
-
 class TableStore(ABC):
-    caps: TableStoreCaps
-
     def get_primary_schema(self) -> DataSchema:
         raise NotImplementedError
 

@@ -58,14 +46,6 @@ class TableStore(ABC):
 
 
 class TableDataSingleFileStore(TableStore):
-    caps = TableStoreCaps(
-        supports_delete=True,
-        supports_get_schema=False,
-        supports_read_all_rows=True,
-        supports_read_nonexistent_rows=True,
-        supports_read_meta_pseudo_df=True,
-    )
-
     def __init__(
        self,
        filename: Union[Path, str, None] = None,
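Dropping the `TableStoreCaps` dataclass and the `TableStore.caps` attribute is the API-visible change of this release: code that introspected capability flags (as the removed test suite further below does) will find no `caps` on stores from 0.15.0.dev1. A defensive pattern that tolerates both versions, purely as a sketch and not part of datapipe:

    def supports(store, flag: str, default: bool = True) -> bool:
        # Return a capability flag if the store still exposes `caps` (0.14.x);
        # otherwise fall back to a caller-chosen default, since 0.15.0.dev1 removes caps entirely.
        caps = getattr(store, "caps", None)
        return getattr(caps, flag, default) if caps is not None else default

    # Example: gate a delete the way the removed tests used to.
    # if supports(store, "supports_delete"):
    #     store.delete_rows(idx)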
{datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "datapipe-core"
-version = "0.14.2-dev.2"
+version = "0.15.0-dev.1"
 description = "`datapipe` is a realtime incremental ETL library for Python application"
 authors = [{ name = "Andrey Tatarinov", email = "a@tatarinov.co" }]
 readme = "README.md"

@@ -40,7 +40,6 @@ redis = ["redis>=4.3.4,<5"]
 qdrant = ["qdrant-client>=1.1.7,<2"]
 ray = ["ray[default]>=2.5.0,<3"]
 gcp = ["opentelemetry-exporter-gcp-trace"]
-elastic = ["elasticsearch>=8.17.1"]
 
 [project.urls]
 Repository = "https://github.com/epoch8/datapipe"

@@ -63,9 +62,6 @@ dev = [
     "types-PyYAML",
 ]
 
-[tool.ruff]
-line-length = 120
-
 [tool.hatch.build.targets.sdist]
 include = ["datapipe", "datapipe/py.typed"]
 
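The two version spellings describe the same release: `0.15.0-dev.1` in pyproject.toml normalizes under PEP 440 to `0.15.0.dev1`, which is what the build backend records in PKG-INFO. A quick check with the `packaging` library (illustration only, not part of the package):

    from packaging.version import Version

    print(Version("0.15.0-dev.1"))                            # 0.15.0.dev1, as recorded in PKG-INFO
    print(Version("0.15.0-dev.1") > Version("0.14.2.dev2"))   # True: dev1 of 0.15.0 sorts after any 0.14.2 build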
datapipe_core-0.14.2.dev2/datapipe/store/elastic.py (removed in 0.15.0.dev1)

@@ -1,211 +0,0 @@
-import base64
-import hashlib
-from typing import Any, Dict, Iterable, Iterator, List, Optional, TypedDict
-
-import pandas as pd
-from elastic_transport import ObjectApiResponse
-from elasticsearch import Elasticsearch, helpers
-from sqlalchemy import Column
-
-from datapipe.run_config import RunConfig
-from datapipe.store.database import MetaKey
-from datapipe.store.table_store import TableStore, TableStoreCaps
-from datapipe.types import DataDF, DataSchema, IndexDF, MetaSchema
-
-
-def get_elastic_id(keys: Iterable[Any], length: int = 20) -> str:
-    concatenated_keys = "".join([str(key) for key in keys])
-    needed_bytes = length * 3 // 4
-    hash_object = hashlib.sha256(concatenated_keys.encode("utf-8"))
-    hash_bytes = hash_object.digest()[:needed_bytes]
-    base64_encoded_id = base64.urlsafe_b64encode(hash_bytes).decode("utf-8")
-    return base64_encoded_id[:length]
-
-
-def _to_itertuples(df: DataDF, colnames):
-    return list(df[colnames].itertuples(index=False, name=None))
-
-
-def remap_dict_keys(data: Dict[str, Any], key_name_remapping: Dict[str, str]) -> Dict[str, Any]:
-    return {key_name_remapping.get(key, key): value for key, value in data.items()}
-
-
-class ElasticStoreState(TypedDict):
-    index: str
-    data_sql_schema: List[Column]
-    es_kwargs: Dict[str, Any]
-    key_name_remapping: Optional[Dict[str, str]]
-    mapping: Optional[dict]
-
-
-class ElasticStore(TableStore):
-    caps = TableStoreCaps(
-        supports_delete=True,
-        supports_read_all_rows=True,
-        supports_get_schema=True,
-        supports_read_meta_pseudo_df=True,
-        supports_read_nonexistent_rows=False,
-    )
-
-    def __init__(
-        self,
-        index: str,
-        data_sql_schema: List[Column],
-        es_kwargs: Dict[str, Any],
-        key_name_remapping: Optional[Dict[str, str]] = None,
-        mapping: Optional[dict] = None,
-    ) -> None:
-        self.index = index
-        self.data_sql_schema = data_sql_schema
-        self.key_name_remapping = key_name_remapping or {}
-        self.primary_key_columns = [column.name for column in self.data_sql_schema if column.primary_key]
-        self.value_key_columns = [column.name for column in self.data_sql_schema if not column.primary_key]
-        self.primary_key_column_rename = "_dtp_orig_{pk}"
-        self.mapping = mapping
-
-        self.es_kwargs = es_kwargs
-        self.es_client = Elasticsearch(**es_kwargs)
-
-    def __getstate__(self) -> ElasticStoreState:
-        return {
-            "index": self.index,
-            "data_sql_schema": self.data_sql_schema,
-            "es_kwargs": self.es_kwargs,
-            "mapping": self.mapping,
-            "key_name_remapping": self.key_name_remapping,
-        }
-
-    def __setstate__(self, state: ElasticStoreState) -> None:
-        ElasticStore.__init__(
-            self,
-            index=state["index"],
-            data_sql_schema=state["data_sql_schema"],
-            es_kwargs=state["es_kwargs"],
-            key_name_remapping=state["key_name_remapping"],
-            mapping=state["mapping"],
-        )
-
-    def insert_rows(self, df: DataDF) -> None:
-        if df.empty:
-            return
-
-        # previously index was implicitly created by the bulk api call, now explicit with mapping
-        index_exists = self.es_client.indices.exists(index=self.index)
-        if not index_exists:
-            self.es_client.indices.create(index=self.index, body=self.mapping)
-
-        actions = []
-        for row in df.to_dict(orient="records"):  # type: ignore
-            # I need to retrieve data in chunks and restore the ids
-            # here ids are hashed, so I need to store the original ide values in _source
-            # since I cannot store the _id in source (ES will not validate request), I rename these fields
-            row_data: Dict[str, Any] = {key: row[key] for key in self.value_key_columns}
-            row_id = get_elastic_id([row[key] for key in self.primary_key_columns])
-            row_data = remap_dict_keys(row_data, self.key_name_remapping)
-            row_data.update(
-                {self.primary_key_column_rename.format(pk=key): row[key] for key in self.primary_key_columns}
-            )
-            actions.append({"_index": self.index, "_source": row_data, "_id": row_id})
-
-        helpers.bulk(client=self.es_client, actions=actions, refresh=True)
-
-    def read_rows(self, idx: Optional[IndexDF] = None) -> DataDF:
-        if idx is not None:
-            if idx.empty:
-                return pd.DataFrame(columns=[column.name for column in self.data_sql_schema])
-
-            key_rows = _to_itertuples(idx, self.primary_key_columns)
-            rows_ids = [get_elastic_id(row) for row in key_rows]
-            data = self.es_client.mget(index=self.index, body={"ids": rows_ids}, source=True)
-            data = data["docs"]
-        else:
-            # elasticsearch has default limit of 10000 per query
-            # I assume you will use the read_rows_meta_pseudo_df for larger result sets
-            data = self.es_client.search(index=self.index, query={"match_all": {}}, size=10000)
-            data = data["hits"]["hits"]
-
-        remapping_with_primary_keys = {
-            **self.key_name_remapping,
-            **{
-                self.primary_key_column_rename.format(pk=primary_key): f"{primary_key}"
-                for primary_key in self.primary_key_columns
-            },
-        }
-        result = [
-            remap_dict_keys(item["_source"], remapping_with_primary_keys)  # type: ignore
-            for item in data
-        ]
-        if result:
-            return pd.DataFrame(result)
-        else:
-            return pd.DataFrame(columns=self.primary_key_columns)
-
-    def read_rows_meta_pseudo_df(
-        self, chunksize: int = 1000, run_config: Optional[RunConfig] = None
-    ) -> Iterator[DataDF]:
-        pit_timeout = "5m"
-
-        pit_resp = self.es_client.open_point_in_time(index=self.index, keep_alive=pit_timeout)
-        pit_id = pit_resp["id"]
-
-        query: dict
-        if run_config:
-            # run_config is not taken into account now
-            query = {"match_all": {}}
-        else:
-            query = {"match_all": {}}
-
-        data_resp: ObjectApiResponse[Any] | None
-        data_resp = self.es_client.search(
-            query=query,
-            sort=["_doc"],
-            pit={"id": pit_id, "keep_alive": pit_timeout},
-            size=chunksize,
-        )
-        if data_resp and len(data_resp["hits"]["hits"]) == 0:
-            data_resp = None
-            yield pd.DataFrame(columns=self.primary_key_columns)
-
-        while data_resp:
-            data = data_resp["hits"]["hits"]
-            last_search_result = data[-1]["sort"]
-
-            remapping_with_primary_keys = {
-                **self.key_name_remapping,
-                **{
-                    self.primary_key_column_rename.format(pk=primary_key): f"{primary_key}"
-                    for primary_key in self.primary_key_columns
-                },
-            }
-            result = [remap_dict_keys(item["_source"], remapping_with_primary_keys) for item in data]
-            yield pd.DataFrame(result)
-
-            data_resp = self.es_client.search(
-                query=query,
-                search_after=last_search_result,
-                sort=["_doc"],
-                pit={"id": pit_id, "keep_alive": pit_timeout},
-                size=chunksize,
-            )
-            if len(data_resp["hits"]["hits"]) == 0:
-                data_resp = None
-
-        self.es_client.close_point_in_time(id=pit_id)
-
-    def delete_rows(self, idx: IndexDF) -> None:
-        if idx.empty:
-            return
-        key_rows = _to_itertuples(idx, self.primary_key_columns)
-        rows_ids = [get_elastic_id(row) for row in key_rows]
-        actions = [{"_op_type": "delete", "_index": self.index, "_id": row_id} for row_id in rows_ids]
-        helpers.bulk(client=self.es_client, actions=actions, refresh=True)
-
-    def get_schema(self) -> DataSchema:
-        return self.data_sql_schema
-
-    def get_primary_schema(self) -> DataSchema:
-        return [column for column in self.data_sql_schema if column.primary_key]
-
-    def get_meta_schema(self) -> MetaSchema:
-        meta_key_prop = MetaKey.get_property_name()
-        return [column for column in self.data_sql_schema if hasattr(column, meta_key_prop)]
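The removed store derived Elasticsearch document ids deterministically from a row's primary-key values: stringify and concatenate the keys, hash with SHA-256, and keep a URL-safe base64 prefix of the requested length. A standalone copy of that scheme, shown only so the id format is easy to inspect outside the deleted module:

    import base64
    import hashlib
    from typing import Any, Iterable

    def get_elastic_id(keys: Iterable[Any], length: int = 20) -> str:
        # 20 base64 characters encode length * 3 // 4 = 15 bytes of the SHA-256 digest.
        concatenated_keys = "".join(str(key) for key in keys)
        needed_bytes = length * 3 // 4
        hash_bytes = hashlib.sha256(concatenated_keys.encode("utf-8")).digest()[:needed_bytes]
        return base64.urlsafe_b64encode(hash_bytes).decode("utf-8")[:length]

    # The same key tuple always maps to the same 20-character id, which is what made
    # the bulk upserts and deletes above idempotent.
    print(get_elastic_id(["user_1", 42]))
    print(get_elastic_id(["user_1", 42]) == get_elastic_id(["user_1", 42]))  # True

One property worth noting (inherent to the original code, not introduced here): because the keys are concatenated without a separator, key tuples such as ["ab", "c"] and ["a", "bc"] hash to the same id.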
datapipe_core-0.14.2.dev2/datapipe/store/tests/__init__.py (removed; empty file, no content to diff)

datapipe_core-0.14.2.dev2/datapipe/store/tests/abstract.py (removed in 0.15.0.dev1)

@@ -1,307 +0,0 @@
-# This is copy of concept of reusable test classes from `fsspec`
-# https://github.com/fsspec/filesystem_spec/tree/master/fsspec/tests/abstract
-
-from typing import Callable, Iterable, cast
-
-import cloudpickle
-import pandas as pd
-import pytest
-from sqlalchemy import Column, String
-
-from datapipe.run_config import RunConfig
-from datapipe.store.table_store import TableStore
-from datapipe.store.tests.stubs import DATA_PARAMS
-from datapipe.tests.util import assert_df_equal, assert_ts_contains
-from datapipe.types import DataDF, DataSchema, IndexDF, data_to_index
-
-TableStoreMaker = Callable[[DataSchema], TableStore]
-
-
-class AbstractBaseStoreFixtures:
-    @pytest.fixture
-    def store_maker(self) -> TableStoreMaker:
-        raise NotImplementedError("This function must be overridden in derived classes")
-
-
-class AbstractBaseStoreTests:
-    def test_cloudpickle(self, store_maker: TableStoreMaker) -> None:
-        store = store_maker(
-            [
-                Column("id", String(), primary_key=True),
-            ]
-        )
-        ser = cloudpickle.dumps(store)
-        cloudpickle.loads(ser)
-
-        # TODO assert store is the same
-
-    @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-    def test_get_schema(
-        self,
-        store_maker: TableStoreMaker,
-        data_df: pd.DataFrame,
-        schema: DataSchema,
-    ) -> None:
-        store = store_maker(schema)
-
-        if not store.caps.supports_get_schema:
-            raise pytest.skip("Store does not support get_schema")
-
-        assert store.get_schema() == schema
-
-    @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-    def test_write_read_rows(
-        self,
-        store_maker: TableStoreMaker,
-        data_df: pd.DataFrame,
-        schema: DataSchema,
-    ) -> None:
-        store = store_maker(schema)
-        store.insert_rows(data_df)
-
-        assert_ts_contains(store, data_df)
-
-    @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-    def test_write_read_full_rows(
-        self, store_maker: TableStoreMaker, data_df: pd.DataFrame, schema: DataSchema
-    ) -> None:
-        store = store_maker(schema)
-
-        if not store.caps.supports_read_all_rows:
-            raise pytest.skip("Store does not support read_all_rows")
-
-        store.insert_rows(data_df)
-
-        assert_df_equal(store.read_rows(), data_df, index_cols=store.primary_keys)
-
-    @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-    def test_insert_identical_rows_twice_and_read_rows(
-        self,
-        store_maker: TableStoreMaker,
-        data_df: pd.DataFrame,
-        schema: DataSchema,
-    ) -> None:
-        store = store_maker(schema)
-
-        store.insert_rows(data_df)
-
-        test_df_mod = data_df.copy()
-        test_df_mod.loc[50:, "price"] = test_df_mod.loc[50:, "price"] + 1
-
-        store.insert_rows(test_df_mod.loc[50:])
-
-        assert_ts_contains(store, test_df_mod)
-
-    @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-    def test_read_non_existent_rows(
-        self,
-        store_maker: TableStoreMaker,
-        data_df: pd.DataFrame,
-        schema: DataSchema,
-    ) -> None:
-        store = store_maker(schema)
-
-        if not store.caps.supports_read_nonexistent_rows:
-            raise pytest.skip("Store does not support read_nonexistent_rows")
-
-        test_df_to_store = data_df.drop(range(1, 5))
-
-        store.insert_rows(test_df_to_store)
-
-        assert_df_equal(
-            store.read_rows(data_to_index(data_df, store.primary_keys)),
-            test_df_to_store,
-            index_cols=store.primary_keys,
-        )
-
-    @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-    def test_read_empty_df(
-        self,
-        store_maker: TableStoreMaker,
-        data_df: pd.DataFrame,
-        schema: DataSchema,
-    ) -> None:
-        store = store_maker(schema)
-        store.insert_rows(data_df)
-
-        df_empty = pd.DataFrame()
-
-        df_result = store.read_rows(cast(IndexDF, df_empty))
-        assert df_result.empty
-        assert all(col in df_result.columns for col in store.primary_keys)
-
-    @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-    def test_insert_empty_df(
-        self,
-        store_maker: TableStoreMaker,
-        data_df: pd.DataFrame,
-        schema: DataSchema,
-    ) -> None:
-        store = store_maker(schema)
-
-        if not store.caps.supports_read_all_rows:
-            raise pytest.skip("Store does not support read_all_rows")
-
-        df_empty = pd.DataFrame()
-        store.insert_rows(df_empty)
-
-        df_result = store.read_rows()
-        assert df_result.empty
-        assert all(col in df_result.columns for col in store.primary_keys)
-
-    @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-    def test_update_empty_df(
-        self,
-        store_maker: TableStoreMaker,
-        data_df: pd.DataFrame,
-        schema: DataSchema,
-    ) -> None:
-        store = store_maker(schema)
-
-        if not store.caps.supports_read_all_rows:
-            raise pytest.skip("Store does not support read_all_rows")
-
-        df_empty = pd.DataFrame()
-        store.update_rows(df_empty)
-
-        df_result = store.read_rows()
-        assert df_result.empty
-        assert all(col in df_result.columns for col in store.primary_keys)
-
-    @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-    def test_partial_update_rows(
-        self,
-        store_maker: TableStoreMaker,
-        data_df: pd.DataFrame,
-        schema: DataSchema,
-    ) -> None:
-        store = store_maker(schema)
-        store.insert_rows(data_df)
-
-        assert_ts_contains(store, data_df)
-
-        test_df_mod = data_df.copy()
-        test_df_mod.loc[50:, "price"] = test_df_mod.loc[50:, "price"] + 1
-
-        store.update_rows(test_df_mod.loc[50:])
-
-        assert_ts_contains(store, test_df_mod)
-
-    @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-    def test_full_update_rows(
-        self,
-        store_maker: TableStoreMaker,
-        data_df: pd.DataFrame,
-        schema: DataSchema,
-    ) -> None:
-        store = store_maker(schema)
-        store.insert_rows(data_df)
-
-        assert_ts_contains(store, data_df)
-
-        data_df_mod = data_df.copy()
-        data_df_mod.loc[:, "price"] = data_df_mod.loc[:, "price"] + 1
-
-        store.update_rows(data_df_mod)
-
-        assert_ts_contains(store, data_df_mod)
-
-    # TODO add test which does not require read_all_rows support
-    @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-    def test_delete_rows(
-        self,
-        store_maker: TableStoreMaker,
-        data_df: pd.DataFrame,
-        schema: DataSchema,
-    ) -> None:
-        store = store_maker(schema)
-
-        if not store.caps.supports_delete:
-            raise pytest.skip("Store does not support delete")
-        if not store.caps.supports_read_all_rows:
-            raise pytest.skip("Store does not support read_all_rows")
-
-        store.insert_rows(data_df)
-
-        assert_df_equal(
-            store.read_rows(data_to_index(data_df, store.primary_keys)),
-            data_df,
-            index_cols=store.primary_keys,
-        )
-
-        store.delete_rows(cast(IndexDF, data_df.loc[20:50, store.primary_keys]))
-
-        assert_df_equal(
-            store.read_rows(),
-            pd.concat([data_df.loc[0:19], data_df.loc[51:]]),
-            index_cols=store.primary_keys,
-        )
-
-    @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-    def test_read_rows_meta_pseudo_df(
-        self,
-        store_maker: TableStoreMaker,
-        data_df: pd.DataFrame,
-        schema: DataSchema,
-    ) -> None:
-        store = store_maker(schema)
-
-        if not store.caps.supports_read_meta_pseudo_df:
-            raise pytest.skip("Store does not support read_meta_pseudo_df")
-
-        store.insert_rows(data_df)
-
-        assert_ts_contains(store, data_df)
-
-        pseudo_df_iter = store.read_rows_meta_pseudo_df()
-
-        assert isinstance(pseudo_df_iter, Iterable)
-
-        pseudo_df = pd.concat(pseudo_df_iter, ignore_index=True)
-
-        for pk in store.primary_keys:
-            assert pk in pseudo_df.columns
-
-        # TODO check that ids of pseudo_df equal to ids of data_df
-
-    @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-    def test_read_empty_rows_meta_pseudo_df(
-        self,
-        store_maker: TableStoreMaker,
-        data_df: pd.DataFrame,
-        schema: DataSchema,
-    ) -> None:
-        store = store_maker(schema)
-
-        if not store.caps.supports_read_meta_pseudo_df:
-            raise pytest.skip("Store does not support read_meta_pseudo_df")
-
-        pseudo_df_iter = store.read_rows_meta_pseudo_df()
-        assert isinstance(pseudo_df_iter, Iterable)
-        for pseudo_df in pseudo_df_iter:
-            assert isinstance(pseudo_df, DataDF)
-            pseudo_df[store.primary_keys]  # Empty df must have primary keys columns
-
-    @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-    def test_read_rows_meta_pseudo_df_with_runconfig(
-        self,
-        store_maker: TableStoreMaker,
-        data_df: pd.DataFrame,
-        schema: DataSchema,
-    ) -> None:
-        store = store_maker(schema)
-
-        if not store.caps.supports_read_meta_pseudo_df:
-            raise pytest.skip("Store does not support read_meta_pseudo_df")
-
-        store.insert_rows(data_df)
-
-        assert_ts_contains(store, data_df)
-
-        # TODO check that runconfig actually affects the resulting data
-        pseudo_df_iter = store.read_rows_meta_pseudo_df(
-            run_config=RunConfig(filters={"a": 1})
-        )
-        assert isinstance(pseudo_df_iter, Iterable)
-        for pseudo_df in pseudo_df_iter:
-            assert isinstance(pseudo_df, DataDF)
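These reusable classes follow the fsspec pattern referenced in the header comment: a concrete store's test module subclasses the fixture and test classes and supplies only a `store_maker` fixture. Roughly like this against 0.14.x, as a sketch only; the `TableStoreDB` keyword arguments mirror those visible in the database.py diff above, and the connection string and table name are made up:

    import pytest

    from datapipe.store.database import TableStoreDB
    from datapipe.store.tests.abstract import AbstractBaseStoreFixtures, AbstractBaseStoreTests
    from datapipe.types import DataSchema


    class DBStoreFixtures(AbstractBaseStoreFixtures):
        @pytest.fixture
        def store_maker(self):
            def make(schema: DataSchema) -> TableStoreDB:
                return TableStoreDB(
                    dbconn="sqlite:///test.sqlite",  # example connection string
                    name="test_table",               # hypothetical table name
                    data_sql_schema=schema,
                )

            return make


    class TestDBStore(DBStoreFixtures, AbstractBaseStoreTests):
        """Inherits every parametrized test; the caps flags decide which ones get skipped."""

Both classes (and the `datapipe.store.tests` package itself) are gone in 0.15.0.dev1, so this import path only exists in 0.14.x.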
datapipe_core-0.14.2.dev2/datapipe/store/tests/stubs.py (removed in 0.15.0.dev1)

@@ -1,89 +0,0 @@
-import pandas as pd
-import pytest
-from sqlalchemy import Column, Integer, String
-
-DATA_PARAMS = [
-    pytest.param(
-        pd.DataFrame(
-            {
-                "id": range(100),
-                "name": [f"Product {i}" for i in range(100)],
-                "price": [1000 + i for i in range(100)],
-            }
-        ),
-        [
-            Column("id", Integer, primary_key=True),
-            Column("name", String(100)),
-            Column("price", Integer),
-        ],
-        id="int_id",
-    ),
-    pytest.param(
-        pd.DataFrame(
-            {
-                "id": [f"id_{i}" for i in range(100)],
-                "name": [f"Product {i}" for i in range(100)],
-                "price": [1000 + i for i in range(100)],
-            }
-        ),
-        [
-            Column("id", String(100), primary_key=True),
-            Column("name", String(100)),
-            Column("price", Integer),
-        ],
-        id="str_id",
-    ),
-    pytest.param(
-        pd.DataFrame(
-            {
-                "id_int": range(100),
-                "id_str": [f"id_{i}" for i in range(100)],
-                "name": [f"Product {i}" for i in range(100)],
-                "price": [1000 + i for i in range(100)],
-            }
-        ),
-        [
-            Column("id_int", Integer, primary_key=True),
-            Column("id_str", String(100), primary_key=True),
-            Column("name", String(100)),
-            Column("price", Integer),
-        ],
-        id="multi_id",
-    ),
-    pytest.param(
-        pd.DataFrame(
-            {
-                "id1": [f"id_{i}" for i in range(1000)],
-                "id2": [f"id_{i}" for i in range(1000)],
-                "name": [f"Product {i}" for i in range(1000)],
-                "price": [1000 + i for i in range(1000)],
-            }
-        ),
-        [
-            Column("id1", String(100), primary_key=True),
-            Column("id2", String(100), primary_key=True),
-            Column("name", String(100)),
-            Column("price", Integer),
-        ],
-        id="double_id_1000_records",
-    ),
-    pytest.param(
-        pd.DataFrame(
-            {
-                "id1": [f"id_{i}" for i in range(1000)],
-                "id2": [f"id_{i}" for i in range(1000)],
-                "id3": [f"id_{i}" for i in range(1000)],
-                "name": [f"Product {i}" for i in range(1000)],
-                "price": [1000 + i for i in range(1000)],
-            }
-        ),
-        [
-            Column("id1", String(100), primary_key=True),
-            Column("id2", String(100), primary_key=True),
-            Column("id3", String(100), primary_key=True),
-            Column("name", String(100)),
-            Column("price", Integer),
-        ],
-        id="triple_id_1000_records",
-    ),
-]

datapipe_core-0.14.2.dev2/datapipe/tests/__init__.py (removed; empty file, no content to diff)
datapipe_core-0.14.2.dev2/datapipe/tests/util.py (removed in 0.15.0.dev1)

@@ -1,59 +0,0 @@
-from typing import List, cast
-
-import pandas as pd
-
-from datapipe.datatable import DataTable
-from datapipe.store.table_store import TableStore
-from datapipe.types import DataDF, IndexDF, data_to_index
-
-
-def assert_idx_equal(a, b):
-    a = sorted(list(a))
-    b = sorted(list(b))
-
-    assert a == b
-
-
-def assert_df_equal(a: pd.DataFrame, b: pd.DataFrame, index_cols=["id"]) -> bool:
-    a = a.set_index(index_cols)
-    b = b.set_index(index_cols)
-
-    assert_idx_equal(a.index, b.index)
-
-    eq_rows = (a.sort_index() == b.sort_index()).all(axis="columns")
-
-    if eq_rows.all():
-        return True
-
-    else:
-        print("Difference")
-        print("A:")
-        print(a.loc[-eq_rows])
-        print("B:")
-        print(b.loc[-eq_rows])
-
-        raise AssertionError
-
-
-def assert_datatable_equal(a: DataTable, b: DataDF) -> bool:
-    return assert_df_equal(a.get_data(), b, index_cols=a.primary_keys)
-
-
-def assert_ts_contains(ts: TableStore, df: DataDF):
-    assert_df_equal(
-        ts.read_rows(data_to_index(df, ts.primary_keys)),
-        df,
-        index_cols=ts.primary_keys,
-    )
-
-
-def assert_idx_no_duplicates(idx: IndexDF, index_cols: List[str]) -> bool:
-    duplicates = cast(IndexDF, idx[idx[index_cols].duplicated()])
-    if len(duplicates) == 0:
-        return True
-    else:
-        idx = cast(IndexDF, idx.loc[idx.index].sort_values(index_cols))
-        print("Duplicated found:")
-        print(idx)
-
-        raise AssertionError