datapipe-core 0.14.2.dev2__tar.gz → 0.15.0.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/.gitignore +0 -1
  2. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/PKG-INFO +1 -3
  3. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/database.py +10 -18
  4. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/filedir.py +8 -17
  5. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/redis.py +4 -20
  6. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/table_store.py +0 -20
  7. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/pyproject.toml +1 -5
  8. datapipe_core-0.14.2.dev2/datapipe/store/elastic.py +0 -211
  9. datapipe_core-0.14.2.dev2/datapipe/store/tests/__init__.py +0 -0
  10. datapipe_core-0.14.2.dev2/datapipe/store/tests/abstract.py +0 -307
  11. datapipe_core-0.14.2.dev2/datapipe/store/tests/stubs.py +0 -89
  12. datapipe_core-0.14.2.dev2/datapipe/tests/__init__.py +0 -0
  13. datapipe_core-0.14.2.dev2/datapipe/tests/util.py +0 -59
  14. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/LICENSE +0 -0
  15. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/README.md +0 -0
  16. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/__init__.py +0 -0
  17. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/cli.py +0 -0
  18. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/compute.py +0 -0
  19. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/datatable.py +0 -0
  20. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/event_logger.py +0 -0
  21. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/executor/__init__.py +0 -0
  22. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/executor/ray.py +0 -0
  23. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/lints.py +0 -0
  24. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/meta/__init__.py +0 -0
  25. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/meta/sql_meta.py +0 -0
  26. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/migrations/__init__.py +0 -0
  27. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/migrations/v013.py +0 -0
  28. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/py.typed +0 -0
  29. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/run_config.py +0 -0
  30. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/sql_util.py +0 -0
  31. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/step/__init__.py +0 -0
  32. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/step/batch_generate.py +0 -0
  33. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/step/batch_transform.py +0 -0
  34. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/step/datatable_transform.py +0 -0
  35. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/step/update_external_table.py +0 -0
  36. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/__init__.py +0 -0
  37. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/milvus.py +0 -0
  38. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/pandas.py +0 -0
  39. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/qdrant.py +0 -0
  40. {datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/types.py +0 -0
{datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/.gitignore
@@ -1,5 +1,4 @@
  .DS_Store
- .idea
  .mypy_cache/
  .pytest_cache/
  .venv/
{datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: datapipe-core
- Version: 0.14.2.dev2
+ Version: 0.15.0.dev1
  Summary: `datapipe` is a realtime incremental ETL library for Python application
  Project-URL: Repository, https://github.com/epoch8/datapipe
  Author-email: Andrey Tatarinov <a@tatarinov.co>
@@ -24,8 +24,6 @@ Requires-Dist: sqlalchemy<3.0.0,>=2.0.0
  Requires-Dist: tqdm-loggable<0.3,>=0.2
  Requires-Dist: traceback-with-variables<3,>=2.0.4
  Requires-Dist: types-tqdm>=4.67.0.20241221
- Provides-Extra: elastic
- Requires-Dist: elasticsearch>=8.17.1; extra == 'elastic'
  Provides-Extra: excel
  Requires-Dist: openpyxl>=3.0.7; extra == 'excel'
  Requires-Dist: xlrd>=2.0.1; extra == 'excel'
{datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/database.py
@@ -14,7 +14,7 @@ from sqlalchemy.sql.expression import delete, select

  from datapipe.run_config import RunConfig
  from datapipe.sql_util import sql_apply_idx_filter_to_table, sql_apply_runconfig_filter
- from datapipe.store.table_store import TableStore, TableStoreCaps
+ from datapipe.store.table_store import TableStore
  from datapipe.types import DataDF, DataSchema, IndexDF, MetaSchema, OrmTable, TAnyDF

  logger = logging.getLogger("datapipe.store.database")
@@ -121,14 +121,6 @@ class MetaKey(SchemaItem):


  class TableStoreDB(TableStore):
-     caps = TableStoreCaps(
-         supports_delete=True,
-         supports_get_schema=True,
-         supports_read_all_rows=True,
-         supports_read_nonexistent_rows=True,
-         supports_read_meta_pseudo_df=True,
-     )
-
      def __init__(
          self,
          dbconn: Union["DBConn", str],
@@ -144,9 +136,9 @@ class TableStoreDB(TableStore):

          if orm_table is not None:
              assert name is None, "name should be None if orm_table is provided"
-             assert data_sql_schema is None, (
-                 "data_sql_schema should be None if orm_table is provided"
-             )
+             assert (
+                 data_sql_schema is None
+             ), "data_sql_schema should be None if orm_table is provided"

              orm_table__table = orm_table.__table__  # type: ignore
              self.data_table = cast(Table, orm_table__table)
@@ -169,12 +161,12 @@ class TableStoreDB(TableStore):
              ]

          else:
-             assert name is not None, (
-                 "name should be provided if data_table is not provided"
-             )
-             assert data_sql_schema is not None, (
-                 "data_sql_schema should be provided if data_table is not provided"
-             )
+             assert (
+                 name is not None
+             ), "name should be provided if data_table is not provided"
+             assert (
+                 data_sql_schema is not None
+             ), "data_sql_schema should be provided if data_table is not provided"

              self.name = name

{datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/filedir.py
@@ -15,7 +15,7 @@ from PIL import Image
  from sqlalchemy import Column, Integer, String

  from datapipe.run_config import RunConfig
- from datapipe.store.table_store import TableStore, TableStoreCaps
+ from datapipe.store.table_store import TableStore
  from datapipe.types import DataDF, DataSchema, IndexDF, MetaSchema


@@ -103,9 +103,9 @@ def _pattern_to_attrnames(pat: str) -> List[str]:
      assert len(attrnames) > 0, "The scheme is not valid."
      if len(attrnames) >= 2:
          duplicates_attrnames = list(duplicates(attrnames))
-         assert len(duplicates_attrnames) == 0, (
-             f"Some keys are repeated: {duplicates_attrnames}. Rename them."
-         )
+         assert (
+             len(duplicates_attrnames) == 0
+         ), f"Some keys are repeated: {duplicates_attrnames}. Rename them."

      return attrnames

@@ -161,14 +161,6 @@ class Replacer:


  class TableStoreFiledir(TableStore):
-     caps = TableStoreCaps(
-         supports_delete=True,
-         supports_get_schema=False,
-         supports_read_all_rows=True,
-         supports_read_nonexistent_rows=False,
-         supports_read_meta_pseudo_df=True,
-     )
-
      def __init__(
          self,
          filename_pattern: Union[str, Path],
@@ -286,8 +278,7 @@ class TableStoreFiledir(TableStore):
              for attrname in self.attrnames
          ]
          self.attrname_to_cls = {
-             column.name: type_to_cls[type(column.type)]
-             for column in self.primary_schema  # type: ignore
+             column.name: type_to_cls[type(column.type)] for column in self.primary_schema  # type: ignore
          }

      def get_primary_schema(self) -> DataSchema:
@@ -331,9 +322,9 @@ class TableStoreFiledir(TableStore):
          """
          _, filepath = fsspec.core.split_protocol(filepath)
          m = re.match(self.filename_match, filepath)
-         assert m is not None, (
-             f"Filepath {filepath} does not match the pattern {self.filename_match}"
-         )
+         assert (
+             m is not None
+         ), f"Filepath {filepath} does not match the pattern {self.filename_match}"

          data = {}
          for attrname in self.attrnames:
{datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/redis.py
@@ -7,7 +7,7 @@ from redis.cluster import RedisCluster
  from sqlalchemy import Column

  from datapipe.store.database import MetaKey
- from datapipe.store.table_store import TableStore, TableStoreCaps
+ from datapipe.store.table_store import TableStore
  from datapipe.types import DataDF, DataSchema, IndexDF, MetaSchema, data_to_index


@@ -24,30 +24,14 @@ def _to_itertuples(df: DataDF, colnames):


  class RedisStore(TableStore):
-     caps = TableStoreCaps(
-         supports_delete=True,
-         supports_get_schema=False,
-         supports_read_all_rows=False,
-         supports_read_nonexistent_rows=False,  # TODO check
-         supports_read_meta_pseudo_df=False,
-     )
-
      def __init__(
-         self,
-         connection: str,
-         name: str,
-         data_sql_schema: List[Column],
-         cluster_mode: bool = False,
+         self, connection: str, name: str, data_sql_schema: List[Column], cluster_mode: bool = False
      ) -> None:
          self.connection = connection
          if not cluster_mode:
-             self.redis_connection: Union[Redis, RedisCluster] = Redis.from_url(
-                 connection, decode_responses=True
-             )
+             self.redis_connection: Union[Redis, RedisCluster] = Redis.from_url(connection, decode_responses=True)
          else:
-             self.redis_connection = RedisCluster.from_url(
-                 connection, decode_responses=True
-             )
+             self.redis_connection = RedisCluster.from_url(connection, decode_responses=True)

          self.name = name
          self.data_sql_schema = data_sql_schema
{datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/datapipe/store/table_store.py
@@ -1,5 +1,4 @@
  from abc import ABC
- from dataclasses import dataclass
  from pathlib import Path
  from typing import Iterator, List, Optional, Union

@@ -10,18 +9,7 @@ from datapipe.run_config import RunConfig
  from datapipe.types import DataDF, DataSchema, IndexDF, MetaSchema, data_to_index


- @dataclass
- class TableStoreCaps:
-     supports_delete: bool
-     supports_get_schema: bool
-     supports_read_all_rows: bool
-     supports_read_nonexistent_rows: bool
-     supports_read_meta_pseudo_df: bool
-
-
  class TableStore(ABC):
-     caps: TableStoreCaps
-
      def get_primary_schema(self) -> DataSchema:
          raise NotImplementedError

@@ -58,14 +46,6 @@


  class TableDataSingleFileStore(TableStore):
-     caps = TableStoreCaps(
-         supports_delete=True,
-         supports_get_schema=False,
-         supports_read_all_rows=True,
-         supports_read_nonexistent_rows=True,
-         supports_read_meta_pseudo_df=True,
-     )
-
      def __init__(
          self,
          filename: Union[Path, str, None] = None,
{datapipe_core-0.14.2.dev2 → datapipe_core-0.15.0.dev1}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "datapipe-core"
- version = "0.14.2-dev.2"
+ version = "0.15.0-dev.1"
  description = "`datapipe` is a realtime incremental ETL library for Python application"
  authors = [{ name = "Andrey Tatarinov", email = "a@tatarinov.co" }]
  readme = "README.md"
@@ -40,7 +40,6 @@ redis = ["redis>=4.3.4,<5"]
  qdrant = ["qdrant-client>=1.1.7,<2"]
  ray = ["ray[default]>=2.5.0,<3"]
  gcp = ["opentelemetry-exporter-gcp-trace"]
- elastic = ["elasticsearch>=8.17.1"]

  [project.urls]
  Repository = "https://github.com/epoch8/datapipe"
@@ -63,9 +62,6 @@ dev = [
      "types-PyYAML",
  ]

- [tool.ruff]
- line-length = 120
-
  [tool.hatch.build.targets.sdist]
  include = ["datapipe", "datapipe/py.typed"]

datapipe_core-0.14.2.dev2/datapipe/store/elastic.py
@@ -1,211 +0,0 @@
- import base64
- import hashlib
- from typing import Any, Dict, Iterable, Iterator, List, Optional, TypedDict
-
- import pandas as pd
- from elastic_transport import ObjectApiResponse
- from elasticsearch import Elasticsearch, helpers
- from sqlalchemy import Column
-
- from datapipe.run_config import RunConfig
- from datapipe.store.database import MetaKey
- from datapipe.store.table_store import TableStore, TableStoreCaps
- from datapipe.types import DataDF, DataSchema, IndexDF, MetaSchema
-
-
- def get_elastic_id(keys: Iterable[Any], length: int = 20) -> str:
-     concatenated_keys = "".join([str(key) for key in keys])
-     needed_bytes = length * 3 // 4
-     hash_object = hashlib.sha256(concatenated_keys.encode("utf-8"))
-     hash_bytes = hash_object.digest()[:needed_bytes]
-     base64_encoded_id = base64.urlsafe_b64encode(hash_bytes).decode("utf-8")
-     return base64_encoded_id[:length]
-
-
- def _to_itertuples(df: DataDF, colnames):
-     return list(df[colnames].itertuples(index=False, name=None))
-
-
- def remap_dict_keys(data: Dict[str, Any], key_name_remapping: Dict[str, str]) -> Dict[str, Any]:
-     return {key_name_remapping.get(key, key): value for key, value in data.items()}
-
-
- class ElasticStoreState(TypedDict):
-     index: str
-     data_sql_schema: List[Column]
-     es_kwargs: Dict[str, Any]
-     key_name_remapping: Optional[Dict[str, str]]
-     mapping: Optional[dict]
-
-
- class ElasticStore(TableStore):
-     caps = TableStoreCaps(
-         supports_delete=True,
-         supports_read_all_rows=True,
-         supports_get_schema=True,
-         supports_read_meta_pseudo_df=True,
-         supports_read_nonexistent_rows=False,
-     )
-
-     def __init__(
-         self,
-         index: str,
-         data_sql_schema: List[Column],
-         es_kwargs: Dict[str, Any],
-         key_name_remapping: Optional[Dict[str, str]] = None,
-         mapping: Optional[dict] = None,
-     ) -> None:
-         self.index = index
-         self.data_sql_schema = data_sql_schema
-         self.key_name_remapping = key_name_remapping or {}
-         self.primary_key_columns = [column.name for column in self.data_sql_schema if column.primary_key]
-         self.value_key_columns = [column.name for column in self.data_sql_schema if not column.primary_key]
-         self.primary_key_column_rename = "_dtp_orig_{pk}"
-         self.mapping = mapping
-
-         self.es_kwargs = es_kwargs
-         self.es_client = Elasticsearch(**es_kwargs)
-
-     def __getstate__(self) -> ElasticStoreState:
-         return {
-             "index": self.index,
-             "data_sql_schema": self.data_sql_schema,
-             "es_kwargs": self.es_kwargs,
-             "mapping": self.mapping,
-             "key_name_remapping": self.key_name_remapping,
-         }
-
-     def __setstate__(self, state: ElasticStoreState) -> None:
-         ElasticStore.__init__(
-             self,
-             index=state["index"],
-             data_sql_schema=state["data_sql_schema"],
-             es_kwargs=state["es_kwargs"],
-             key_name_remapping=state["key_name_remapping"],
-             mapping=state["mapping"],
-         )
-
-     def insert_rows(self, df: DataDF) -> None:
-         if df.empty:
-             return
-
-         # previously index was implicitly created by the bulk api call, now explicit with mapping
-         index_exists = self.es_client.indices.exists(index=self.index)
-         if not index_exists:
-             self.es_client.indices.create(index=self.index, body=self.mapping)
-
-         actions = []
-         for row in df.to_dict(orient="records"):  # type: ignore
-             # I need to retrieve data in chunks and restore the ids
-             # here ids are hashed, so I need to store the original ide values in _source
-             # since I cannot store the _id in source (ES will not validate request), I rename these fields
-             row_data: Dict[str, Any] = {key: row[key] for key in self.value_key_columns}
-             row_id = get_elastic_id([row[key] for key in self.primary_key_columns])
-             row_data = remap_dict_keys(row_data, self.key_name_remapping)
-             row_data.update(
-                 {self.primary_key_column_rename.format(pk=key): row[key] for key in self.primary_key_columns}
-             )
-             actions.append({"_index": self.index, "_source": row_data, "_id": row_id})
-
-         helpers.bulk(client=self.es_client, actions=actions, refresh=True)
-
-     def read_rows(self, idx: Optional[IndexDF] = None) -> DataDF:
-         if idx is not None:
-             if idx.empty:
-                 return pd.DataFrame(columns=[column.name for column in self.data_sql_schema])
-
-             key_rows = _to_itertuples(idx, self.primary_key_columns)
-             rows_ids = [get_elastic_id(row) for row in key_rows]
-             data = self.es_client.mget(index=self.index, body={"ids": rows_ids}, source=True)
-             data = data["docs"]
-         else:
-             # elasticsearch has default limit of 10000 per query
-             # I assume you will use the read_rows_meta_pseudo_df for larger result sets
-             data = self.es_client.search(index=self.index, query={"match_all": {}}, size=10000)
-             data = data["hits"]["hits"]
-
-         remapping_with_primary_keys = {
-             **self.key_name_remapping,
-             **{
-                 self.primary_key_column_rename.format(pk=primary_key): f"{primary_key}"
-                 for primary_key in self.primary_key_columns
-             },
-         }
-         result = [
-             remap_dict_keys(item["_source"], remapping_with_primary_keys)  # type: ignore
-             for item in data
-         ]
-         if result:
-             return pd.DataFrame(result)
-         else:
-             return pd.DataFrame(columns=self.primary_key_columns)
-
-     def read_rows_meta_pseudo_df(
-         self, chunksize: int = 1000, run_config: Optional[RunConfig] = None
-     ) -> Iterator[DataDF]:
-         pit_timeout = "5m"
-
-         pit_resp = self.es_client.open_point_in_time(index=self.index, keep_alive=pit_timeout)
-         pit_id = pit_resp["id"]
-
-         query: dict
-         if run_config:
-             # run_config is not taken into account now
-             query = {"match_all": {}}
-         else:
-             query = {"match_all": {}}
-
-         data_resp: ObjectApiResponse[Any] | None
-         data_resp = self.es_client.search(
-             query=query,
-             sort=["_doc"],
-             pit={"id": pit_id, "keep_alive": pit_timeout},
-             size=chunksize,
-         )
-         if data_resp and len(data_resp["hits"]["hits"]) == 0:
-             data_resp = None
-             yield pd.DataFrame(columns=self.primary_key_columns)
-
-         while data_resp:
-             data = data_resp["hits"]["hits"]
-             last_search_result = data[-1]["sort"]
-
-             remapping_with_primary_keys = {
-                 **self.key_name_remapping,
-                 **{
-                     self.primary_key_column_rename.format(pk=primary_key): f"{primary_key}"
-                     for primary_key in self.primary_key_columns
-                 },
-             }
-             result = [remap_dict_keys(item["_source"], remapping_with_primary_keys) for item in data]
-             yield pd.DataFrame(result)
-
-             data_resp = self.es_client.search(
-                 query=query,
-                 search_after=last_search_result,
-                 sort=["_doc"],
-                 pit={"id": pit_id, "keep_alive": pit_timeout},
-                 size=chunksize,
-             )
-             if len(data_resp["hits"]["hits"]) == 0:
-                 data_resp = None
-
-         self.es_client.close_point_in_time(id=pit_id)
-
-     def delete_rows(self, idx: IndexDF) -> None:
-         if idx.empty:
-             return
-         key_rows = _to_itertuples(idx, self.primary_key_columns)
-         rows_ids = [get_elastic_id(row) for row in key_rows]
-         actions = [{"_op_type": "delete", "_index": self.index, "_id": row_id} for row_id in rows_ids]
-         helpers.bulk(client=self.es_client, actions=actions, refresh=True)
-
-     def get_schema(self) -> DataSchema:
-         return self.data_sql_schema
-
-     def get_primary_schema(self) -> DataSchema:
-         return [column for column in self.data_sql_schema if column.primary_key]
-
-     def get_meta_schema(self) -> MetaSchema:
-         meta_key_prop = MetaKey.get_property_name()
-         return [column for column in self.data_sql_schema if hasattr(column, meta_key_prop)]
datapipe_core-0.14.2.dev2/datapipe/store/tests/abstract.py
@@ -1,307 +0,0 @@
- # This is copy of concept of reusable test classes from `fsspec`
- # https://github.com/fsspec/filesystem_spec/tree/master/fsspec/tests/abstract
-
- from typing import Callable, Iterable, cast
-
- import cloudpickle
- import pandas as pd
- import pytest
- from sqlalchemy import Column, String
-
- from datapipe.run_config import RunConfig
- from datapipe.store.table_store import TableStore
- from datapipe.store.tests.stubs import DATA_PARAMS
- from datapipe.tests.util import assert_df_equal, assert_ts_contains
- from datapipe.types import DataDF, DataSchema, IndexDF, data_to_index
-
- TableStoreMaker = Callable[[DataSchema], TableStore]
-
-
- class AbstractBaseStoreFixtures:
-     @pytest.fixture
-     def store_maker(self) -> TableStoreMaker:
-         raise NotImplementedError("This function must be overridden in derived classes")
-
-
- class AbstractBaseStoreTests:
-     def test_cloudpickle(self, store_maker: TableStoreMaker) -> None:
-         store = store_maker(
-             [
-                 Column("id", String(), primary_key=True),
-             ]
-         )
-         ser = cloudpickle.dumps(store)
-         cloudpickle.loads(ser)
-
-         # TODO assert store is the same
-
-     @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-     def test_get_schema(
-         self,
-         store_maker: TableStoreMaker,
-         data_df: pd.DataFrame,
-         schema: DataSchema,
-     ) -> None:
-         store = store_maker(schema)
-
-         if not store.caps.supports_get_schema:
-             raise pytest.skip("Store does not support get_schema")
-
-         assert store.get_schema() == schema
-
-     @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-     def test_write_read_rows(
-         self,
-         store_maker: TableStoreMaker,
-         data_df: pd.DataFrame,
-         schema: DataSchema,
-     ) -> None:
-         store = store_maker(schema)
-         store.insert_rows(data_df)
-
-         assert_ts_contains(store, data_df)
-
-     @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-     def test_write_read_full_rows(
-         self, store_maker: TableStoreMaker, data_df: pd.DataFrame, schema: DataSchema
-     ) -> None:
-         store = store_maker(schema)
-
-         if not store.caps.supports_read_all_rows:
-             raise pytest.skip("Store does not support read_all_rows")
-
-         store.insert_rows(data_df)
-
-         assert_df_equal(store.read_rows(), data_df, index_cols=store.primary_keys)
-
-     @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-     def test_insert_identical_rows_twice_and_read_rows(
-         self,
-         store_maker: TableStoreMaker,
-         data_df: pd.DataFrame,
-         schema: DataSchema,
-     ) -> None:
-         store = store_maker(schema)
-
-         store.insert_rows(data_df)
-
-         test_df_mod = data_df.copy()
-         test_df_mod.loc[50:, "price"] = test_df_mod.loc[50:, "price"] + 1
-
-         store.insert_rows(test_df_mod.loc[50:])
-
-         assert_ts_contains(store, test_df_mod)
-
-     @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-     def test_read_non_existent_rows(
-         self,
-         store_maker: TableStoreMaker,
-         data_df: pd.DataFrame,
-         schema: DataSchema,
-     ) -> None:
-         store = store_maker(schema)
-
-         if not store.caps.supports_read_nonexistent_rows:
-             raise pytest.skip("Store does not support read_nonexistent_rows")
-
-         test_df_to_store = data_df.drop(range(1, 5))
-
-         store.insert_rows(test_df_to_store)
-
-         assert_df_equal(
-             store.read_rows(data_to_index(data_df, store.primary_keys)),
-             test_df_to_store,
-             index_cols=store.primary_keys,
-         )
-
-     @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-     def test_read_empty_df(
-         self,
-         store_maker: TableStoreMaker,
-         data_df: pd.DataFrame,
-         schema: DataSchema,
-     ) -> None:
-         store = store_maker(schema)
-         store.insert_rows(data_df)
-
-         df_empty = pd.DataFrame()
-
-         df_result = store.read_rows(cast(IndexDF, df_empty))
-         assert df_result.empty
-         assert all(col in df_result.columns for col in store.primary_keys)
-
-     @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-     def test_insert_empty_df(
-         self,
-         store_maker: TableStoreMaker,
-         data_df: pd.DataFrame,
-         schema: DataSchema,
-     ) -> None:
-         store = store_maker(schema)
-
-         if not store.caps.supports_read_all_rows:
-             raise pytest.skip("Store does not support read_all_rows")
-
-         df_empty = pd.DataFrame()
-         store.insert_rows(df_empty)
-
-         df_result = store.read_rows()
-         assert df_result.empty
-         assert all(col in df_result.columns for col in store.primary_keys)
-
-     @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-     def test_update_empty_df(
-         self,
-         store_maker: TableStoreMaker,
-         data_df: pd.DataFrame,
-         schema: DataSchema,
-     ) -> None:
-         store = store_maker(schema)
-
-         if not store.caps.supports_read_all_rows:
-             raise pytest.skip("Store does not support read_all_rows")
-
-         df_empty = pd.DataFrame()
-         store.update_rows(df_empty)
-
-         df_result = store.read_rows()
-         assert df_result.empty
-         assert all(col in df_result.columns for col in store.primary_keys)
-
-     @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-     def test_partial_update_rows(
-         self,
-         store_maker: TableStoreMaker,
-         data_df: pd.DataFrame,
-         schema: DataSchema,
-     ) -> None:
-         store = store_maker(schema)
-         store.insert_rows(data_df)
-
-         assert_ts_contains(store, data_df)
-
-         test_df_mod = data_df.copy()
-         test_df_mod.loc[50:, "price"] = test_df_mod.loc[50:, "price"] + 1
-
-         store.update_rows(test_df_mod.loc[50:])
-
-         assert_ts_contains(store, test_df_mod)
-
-     @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-     def test_full_update_rows(
-         self,
-         store_maker: TableStoreMaker,
-         data_df: pd.DataFrame,
-         schema: DataSchema,
-     ) -> None:
-         store = store_maker(schema)
-         store.insert_rows(data_df)
-
-         assert_ts_contains(store, data_df)
-
-         data_df_mod = data_df.copy()
-         data_df_mod.loc[:, "price"] = data_df_mod.loc[:, "price"] + 1
-
-         store.update_rows(data_df_mod)
-
-         assert_ts_contains(store, data_df_mod)
-
-     # TODO add test which does not require read_all_rows support
-     @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-     def test_delete_rows(
-         self,
-         store_maker: TableStoreMaker,
-         data_df: pd.DataFrame,
-         schema: DataSchema,
-     ) -> None:
-         store = store_maker(schema)
-
-         if not store.caps.supports_delete:
-             raise pytest.skip("Store does not support delete")
-         if not store.caps.supports_read_all_rows:
-             raise pytest.skip("Store does not support read_all_rows")
-
-         store.insert_rows(data_df)
-
-         assert_df_equal(
-             store.read_rows(data_to_index(data_df, store.primary_keys)),
-             data_df,
-             index_cols=store.primary_keys,
-         )
-
-         store.delete_rows(cast(IndexDF, data_df.loc[20:50, store.primary_keys]))
-
-         assert_df_equal(
-             store.read_rows(),
-             pd.concat([data_df.loc[0:19], data_df.loc[51:]]),
-             index_cols=store.primary_keys,
-         )
-
-     @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-     def test_read_rows_meta_pseudo_df(
-         self,
-         store_maker: TableStoreMaker,
-         data_df: pd.DataFrame,
-         schema: DataSchema,
-     ) -> None:
-         store = store_maker(schema)
-
-         if not store.caps.supports_read_meta_pseudo_df:
-             raise pytest.skip("Store does not support read_meta_pseudo_df")
-
-         store.insert_rows(data_df)
-
-         assert_ts_contains(store, data_df)
-
-         pseudo_df_iter = store.read_rows_meta_pseudo_df()
-
-         assert isinstance(pseudo_df_iter, Iterable)
-
-         pseudo_df = pd.concat(pseudo_df_iter, ignore_index=True)
-
-         for pk in store.primary_keys:
-             assert pk in pseudo_df.columns
-
-         # TODO check that ids of pseudo_df equal to ids of data_df
-
-     @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-     def test_read_empty_rows_meta_pseudo_df(
-         self,
-         store_maker: TableStoreMaker,
-         data_df: pd.DataFrame,
-         schema: DataSchema,
-     ) -> None:
-         store = store_maker(schema)
-
-         if not store.caps.supports_read_meta_pseudo_df:
-             raise pytest.skip("Store does not support read_meta_pseudo_df")
-
-         pseudo_df_iter = store.read_rows_meta_pseudo_df()
-         assert isinstance(pseudo_df_iter, Iterable)
-         for pseudo_df in pseudo_df_iter:
-             assert isinstance(pseudo_df, DataDF)
-             pseudo_df[store.primary_keys]  # Empty df must have primary keys columns
-
-     @pytest.mark.parametrize("data_df,schema", DATA_PARAMS)
-     def test_read_rows_meta_pseudo_df_with_runconfig(
-         self,
-         store_maker: TableStoreMaker,
-         data_df: pd.DataFrame,
-         schema: DataSchema,
-     ) -> None:
-         store = store_maker(schema)
-
-         if not store.caps.supports_read_meta_pseudo_df:
-             raise pytest.skip("Store does not support read_meta_pseudo_df")
-
-         store.insert_rows(data_df)
-
-         assert_ts_contains(store, data_df)
-
-         # TODO проверять, что runconfig реально влияет на результирующие данные
-         pseudo_df_iter = store.read_rows_meta_pseudo_df(
-             run_config=RunConfig(filters={"a": 1})
-         )
-         assert isinstance(pseudo_df_iter, Iterable)
-         for pseudo_df in pseudo_df_iter:
-             assert isinstance(pseudo_df, DataDF)
datapipe_core-0.14.2.dev2/datapipe/store/tests/stubs.py
@@ -1,89 +0,0 @@
- import pandas as pd
- import pytest
- from sqlalchemy import Column, Integer, String
-
- DATA_PARAMS = [
-     pytest.param(
-         pd.DataFrame(
-             {
-                 "id": range(100),
-                 "name": [f"Product {i}" for i in range(100)],
-                 "price": [1000 + i for i in range(100)],
-             }
-         ),
-         [
-             Column("id", Integer, primary_key=True),
-             Column("name", String(100)),
-             Column("price", Integer),
-         ],
-         id="int_id",
-     ),
-     pytest.param(
-         pd.DataFrame(
-             {
-                 "id": [f"id_{i}" for i in range(100)],
-                 "name": [f"Product {i}" for i in range(100)],
-                 "price": [1000 + i for i in range(100)],
-             }
-         ),
-         [
-             Column("id", String(100), primary_key=True),
-             Column("name", String(100)),
-             Column("price", Integer),
-         ],
-         id="str_id",
-     ),
-     pytest.param(
-         pd.DataFrame(
-             {
-                 "id_int": range(100),
-                 "id_str": [f"id_{i}" for i in range(100)],
-                 "name": [f"Product {i}" for i in range(100)],
-                 "price": [1000 + i for i in range(100)],
-             }
-         ),
-         [
-             Column("id_int", Integer, primary_key=True),
-             Column("id_str", String(100), primary_key=True),
-             Column("name", String(100)),
-             Column("price", Integer),
-         ],
-         id="multi_id",
-     ),
-     pytest.param(
-         pd.DataFrame(
-             {
-                 "id1": [f"id_{i}" for i in range(1000)],
-                 "id2": [f"id_{i}" for i in range(1000)],
-                 "name": [f"Product {i}" for i in range(1000)],
-                 "price": [1000 + i for i in range(1000)],
-             }
-         ),
-         [
-             Column("id1", String(100), primary_key=True),
-             Column("id2", String(100), primary_key=True),
-             Column("name", String(100)),
-             Column("price", Integer),
-         ],
-         id="double_id_1000_records",
-     ),
-     pytest.param(
-         pd.DataFrame(
-             {
-                 "id1": [f"id_{i}" for i in range(1000)],
-                 "id2": [f"id_{i}" for i in range(1000)],
-                 "id3": [f"id_{i}" for i in range(1000)],
-                 "name": [f"Product {i}" for i in range(1000)],
-                 "price": [1000 + i for i in range(1000)],
-             }
-         ),
-         [
-             Column("id1", String(100), primary_key=True),
-             Column("id2", String(100), primary_key=True),
-             Column("id3", String(100), primary_key=True),
-             Column("name", String(100)),
-             Column("price", Integer),
-         ],
-         id="triple_id_1000_records",
-     ),
- ]
datapipe_core-0.14.2.dev2/datapipe/store/tests/__init__.py, datapipe_core-0.14.2.dev2/datapipe/tests/__init__.py: File without changes
datapipe_core-0.14.2.dev2/datapipe/tests/util.py
@@ -1,59 +0,0 @@
- from typing import List, cast
-
- import pandas as pd
-
- from datapipe.datatable import DataTable
- from datapipe.store.table_store import TableStore
- from datapipe.types import DataDF, IndexDF, data_to_index
-
-
- def assert_idx_equal(a, b):
-     a = sorted(list(a))
-     b = sorted(list(b))
-
-     assert a == b
-
-
- def assert_df_equal(a: pd.DataFrame, b: pd.DataFrame, index_cols=["id"]) -> bool:
-     a = a.set_index(index_cols)
-     b = b.set_index(index_cols)
-
-     assert_idx_equal(a.index, b.index)
-
-     eq_rows = (a.sort_index() == b.sort_index()).all(axis="columns")
-
-     if eq_rows.all():
-         return True
-
-     else:
-         print("Difference")
-         print("A:")
-         print(a.loc[-eq_rows])
-         print("B:")
-         print(b.loc[-eq_rows])
-
-         raise AssertionError
-
-
- def assert_datatable_equal(a: DataTable, b: DataDF) -> bool:
-     return assert_df_equal(a.get_data(), b, index_cols=a.primary_keys)
-
-
- def assert_ts_contains(ts: TableStore, df: DataDF):
-     assert_df_equal(
-         ts.read_rows(data_to_index(df, ts.primary_keys)),
-         df,
-         index_cols=ts.primary_keys,
-     )
-
-
- def assert_idx_no_duplicates(idx: IndexDF, index_cols: List[str]) -> bool:
-     duplicates = cast(IndexDF, idx[idx[index_cols].duplicated()])
-     if len(duplicates) == 0:
-         return True
-     else:
-         idx = cast(IndexDF, idx.loc[idx.index].sort_values(index_cols))
-         print("Duplicated found:")
-         print(idx)
-
-         raise AssertionError