cratedb-toolkit 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cratedb_toolkit/__init__.py +11 -0
- cratedb_toolkit/adapter/__init__.py +0 -0
- cratedb_toolkit/adapter/pymongo/__init__.py +1 -0
- cratedb_toolkit/adapter/pymongo/api.py +77 -0
- cratedb_toolkit/adapter/pymongo/collection.py +156 -0
- cratedb_toolkit/adapter/pymongo/cursor.py +390 -0
- cratedb_toolkit/adapter/pymongo/reactor.py +74 -0
- cratedb_toolkit/adapter/pymongo/util.py +51 -0
- cratedb_toolkit/adapter/rockset/__init__.py +0 -0
- cratedb_toolkit/adapter/rockset/cli.py +55 -0
- cratedb_toolkit/adapter/rockset/server/__init__.py +0 -0
- cratedb_toolkit/adapter/rockset/server/api/__init__.py +0 -0
- cratedb_toolkit/adapter/rockset/server/api/collection.py +8 -0
- cratedb_toolkit/adapter/rockset/server/api/document.py +91 -0
- cratedb_toolkit/adapter/rockset/server/api/query.py +68 -0
- cratedb_toolkit/adapter/rockset/server/dependencies.py +13 -0
- cratedb_toolkit/adapter/rockset/server/main.py +28 -0
- cratedb_toolkit/api/__init__.py +0 -0
- cratedb_toolkit/api/cli.py +31 -0
- cratedb_toolkit/api/guide.py +38 -0
- cratedb_toolkit/api/main.py +183 -0
- cratedb_toolkit/cfr/__init__.py +0 -0
- cratedb_toolkit/cfr/cli.py +195 -0
- cratedb_toolkit/cfr/info.py +59 -0
- cratedb_toolkit/cfr/jobstats.py +272 -0
- cratedb_toolkit/cfr/marimo.py +308 -0
- cratedb_toolkit/cfr/systable.py +257 -0
- cratedb_toolkit/cli.py +36 -0
- cratedb_toolkit/cluster/__init__.py +0 -0
- cratedb_toolkit/cluster/cli.py +43 -0
- cratedb_toolkit/cluster/croud.py +37 -0
- cratedb_toolkit/cluster/util.py +13 -0
- cratedb_toolkit/cmd/__init__.py +0 -0
- cratedb_toolkit/cmd/tail/__init__.py +0 -0
- cratedb_toolkit/cmd/tail/cli.py +56 -0
- cratedb_toolkit/cmd/tail/main.py +136 -0
- cratedb_toolkit/datasets/__init__.py +1 -0
- cratedb_toolkit/datasets/core.py +9 -0
- cratedb_toolkit/datasets/kaggle.py +49 -0
- cratedb_toolkit/datasets/model.py +138 -0
- cratedb_toolkit/datasets/store.py +3 -0
- cratedb_toolkit/datasets/tutorial.py +117 -0
- cratedb_toolkit/datasets/util.py +27 -0
- cratedb_toolkit/docs/__init__.py +0 -0
- cratedb_toolkit/docs/cli.py +112 -0
- cratedb_toolkit/docs/functions.py +133 -0
- cratedb_toolkit/docs/model.py +26 -0
- cratedb_toolkit/docs/settings.py +728 -0
- cratedb_toolkit/docs/util.py +61 -0
- cratedb_toolkit/exception.py +10 -0
- cratedb_toolkit/iac/__init__.py +0 -0
- cratedb_toolkit/iac/aws.py +10 -0
- cratedb_toolkit/info/__init__.py +0 -0
- cratedb_toolkit/info/cli.py +98 -0
- cratedb_toolkit/info/core.py +93 -0
- cratedb_toolkit/info/http.py +41 -0
- cratedb_toolkit/info/library.py +596 -0
- cratedb_toolkit/info/model.py +86 -0
- cratedb_toolkit/info/util.py +21 -0
- cratedb_toolkit/io/__init__.py +0 -0
- cratedb_toolkit/io/cli.py +92 -0
- cratedb_toolkit/io/core.py +189 -0
- cratedb_toolkit/io/croud.py +196 -0
- cratedb_toolkit/io/dynamodb/__init__.py +0 -0
- cratedb_toolkit/io/dynamodb/adapter.py +73 -0
- cratedb_toolkit/io/dynamodb/api.py +40 -0
- cratedb_toolkit/io/dynamodb/copy.py +96 -0
- cratedb_toolkit/io/influxdb.py +27 -0
- cratedb_toolkit/io/kinesis/__init__.py +0 -0
- cratedb_toolkit/io/kinesis/adapter.py +129 -0
- cratedb_toolkit/io/kinesis/api.py +6 -0
- cratedb_toolkit/io/kinesis/relay.py +91 -0
- cratedb_toolkit/io/mongodb/__init__.py +0 -0
- cratedb_toolkit/io/mongodb/adapter.py +246 -0
- cratedb_toolkit/io/mongodb/api.py +209 -0
- cratedb_toolkit/io/mongodb/cdc.py +130 -0
- cratedb_toolkit/io/mongodb/cli.py +112 -0
- cratedb_toolkit/io/mongodb/copy.py +107 -0
- cratedb_toolkit/io/mongodb/core.py +129 -0
- cratedb_toolkit/io/mongodb/export.py +72 -0
- cratedb_toolkit/io/mongodb/extract.py +208 -0
- cratedb_toolkit/io/mongodb/model.py +4 -0
- cratedb_toolkit/io/mongodb/transform.py +67 -0
- cratedb_toolkit/io/mongodb/translate.py +191 -0
- cratedb_toolkit/io/mongodb/util.py +64 -0
- cratedb_toolkit/io/processor/__init__.py +0 -0
- cratedb_toolkit/io/processor/kinesis_lambda.py +159 -0
- cratedb_toolkit/io/sql.py +1 -0
- cratedb_toolkit/job/__init__.py +0 -0
- cratedb_toolkit/job/cli.py +30 -0
- cratedb_toolkit/job/croud.py +17 -0
- cratedb_toolkit/model.py +197 -0
- cratedb_toolkit/options.py +11 -0
- cratedb_toolkit/query/__init__.py +0 -0
- cratedb_toolkit/query/cli.py +26 -0
- cratedb_toolkit/query/convert/__init__.py +0 -0
- cratedb_toolkit/query/convert/basic.py +24 -0
- cratedb_toolkit/query/convert/cli.py +43 -0
- cratedb_toolkit/query/mcp/__init__.py +0 -0
- cratedb_toolkit/query/mcp/cli.py +92 -0
- cratedb_toolkit/query/mcp/inquiry.py +133 -0
- cratedb_toolkit/query/mcp/model.py +144 -0
- cratedb_toolkit/query/mcp/pg_mcp.py +8 -0
- cratedb_toolkit/query/mcp/registry.py +213 -0
- cratedb_toolkit/query/mcp/util.py +97 -0
- cratedb_toolkit/retention/__init__.py +0 -0
- cratedb_toolkit/retention/cli.py +320 -0
- cratedb_toolkit/retention/core.py +209 -0
- cratedb_toolkit/retention/model.py +192 -0
- cratedb_toolkit/retention/setup/__init__.py +0 -0
- cratedb_toolkit/retention/setup/schema.py +35 -0
- cratedb_toolkit/retention/setup/schema.sql +34 -0
- cratedb_toolkit/retention/store.py +252 -0
- cratedb_toolkit/retention/strategy/__init__.py +0 -0
- cratedb_toolkit/retention/strategy/delete.py +37 -0
- cratedb_toolkit/retention/strategy/reallocate.py +65 -0
- cratedb_toolkit/retention/strategy/snapshot.py +41 -0
- cratedb_toolkit/shell/__init__.py +0 -0
- cratedb_toolkit/shell/cli.py +78 -0
- cratedb_toolkit/testing/__init__.py +0 -0
- cratedb_toolkit/testing/pytest.py +27 -0
- cratedb_toolkit/testing/testcontainers/__init__.py +0 -0
- cratedb_toolkit/testing/testcontainers/azurite.py +74 -0
- cratedb_toolkit/testing/testcontainers/cratedb.py +216 -0
- cratedb_toolkit/testing/testcontainers/influxdb2.py +83 -0
- cratedb_toolkit/testing/testcontainers/localstack.py +41 -0
- cratedb_toolkit/testing/testcontainers/minio.py +46 -0
- cratedb_toolkit/testing/testcontainers/mongodb.py +141 -0
- cratedb_toolkit/testing/testcontainers/util.py +126 -0
- cratedb_toolkit/util/__init__.py +0 -0
- cratedb_toolkit/util/cli.py +133 -0
- cratedb_toolkit/util/common.py +44 -0
- cratedb_toolkit/util/config.py +55 -0
- cratedb_toolkit/util/cr8.py +8 -0
- cratedb_toolkit/util/crash.py +37 -0
- cratedb_toolkit/util/croud.py +170 -0
- cratedb_toolkit/util/data.py +54 -0
- cratedb_toolkit/util/data_dict.py +165 -0
- cratedb_toolkit/util/database.py +438 -0
- cratedb_toolkit/util/date.py +13 -0
- cratedb_toolkit/util/format.py +88 -0
- cratedb_toolkit/util/io.py +15 -0
- cratedb_toolkit/util/pandas.py +83 -0
- cratedb_toolkit/util/platform.py +56 -0
- cratedb_toolkit/util/process.py +23 -0
- cratedb_toolkit/util/service.py +24 -0
- cratedb_toolkit/util/sqlalchemy.py +16 -0
- cratedb_toolkit-0.0.0.dist-info/METADATA +275 -0
- cratedb_toolkit-0.0.0.dist-info/RECORD +153 -0
- cratedb_toolkit-0.0.0.dist-info/WHEEL +5 -0
- cratedb_toolkit-0.0.0.dist-info/entry_points.txt +9 -0
- cratedb_toolkit-0.0.0.dist-info/licenses/LICENSE +619 -0
- cratedb_toolkit-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# Prefer the standard-library metadata API; fall back to the
# `importlib_metadata` package when the stdlib module cannot be imported.
try:
    from importlib.metadata import PackageNotFoundError, version
except (ImportError, ModuleNotFoundError):  # pragma:nocover
    from importlib_metadata import PackageNotFoundError, version  # type: ignore[assignment,no-redef,unused-ignore]

# Distribution name used for the installed-version lookup below.
__appname__ = "cratedb-toolkit"

# Resolve the version from the installed distribution's metadata. When no
# distribution with that name is found, use the placeholder "unknown"
# instead of failing at import time.
try:
    __version__ = version(__appname__)
except PackageNotFoundError:  # pragma: no cover
    __version__ = "unknown"
|
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .api import PyMongoCrateDBAdapter # noqa: F401
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
from unittest.mock import patch
|
|
2
|
+
|
|
3
|
+
import pymongo.collection
|
|
4
|
+
|
|
5
|
+
from cratedb_toolkit.adapter.pymongo.collection import collection_factory
|
|
6
|
+
from cratedb_toolkit.util.database import DatabaseAdapter
|
|
7
|
+
from cratedb_toolkit.util.pandas import patch_pandas_sqltable_with_extended_mapping
|
|
8
|
+
from cratedb_toolkit.util.sqlalchemy import patch_types_map
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class PyMongoCrateDBAdapter:
    """
    Patch PyMongo to talk to CrateDB.

    The adapter prepares a list of `unittest.mock.patch` objects on
    construction. They are started by `start()` / `activate()` and stopped
    by `stop()` / `deactivate()`. The instance can also be used as a context
    manager, which starts the adapter on entry and stops it on exit.
    """

    def __init__(self, dburi: str):
        """
        Create the database adapter and prepare (but do not start) the patches.

        :param dburi: Database URI handed to `DatabaseAdapter`.
        """
        self.cratedb = DatabaseAdapter(dburi=dburi)
        # Reference to the original PyMongo `Collection` class.
        self.collection_backup = pymongo.collection.Collection

        collection_patched = collection_factory(cratedb=self.cratedb)  # type: ignore[misc]
        self.patches = [
            # Patch PyMongo's `Collection` implementation.
            patch("pymongo.collection.Collection", collection_patched),
            patch("pymongo.database.Collection", collection_patched),
            # Converge a few low-level functions of PyMongo to no-ops.
            patch("pymongo.mongo_client.MongoClient._ensure_session"),
            patch("pymongo.mongo_client._ClientConnectionRetryable._get_server"),
        ]

    def start(self):
        """Configure SQLAlchemy/pandas for CrateDB, then activate the patches."""
        self.adjust_sqlalchemy()
        self.activate()

    def stop(self):
        """Deactivate the patches."""
        self.deactivate()

    def __enter__(self):
        """
        Start the adapter and return it, so that
        `with PyMongoCrateDBAdapter(...) as adapter:` binds the instance.
        """
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop the adapter. Exceptions from the `with` body are not suppressed."""
        self.stop()

    def adjust_sqlalchemy(self):
        """
        Configure CrateDB SQLAlchemy dialect.

        Setting the CrateDB column policy to `dynamic` means that new columns
        can be added without needing to explicitly change the table definition
        by running corresponding `ALTER TABLE` statements.

        https://cratedb.com/docs/crate/reference/en/latest/general/ddl/column-policy.html#dynamic
        """
        # 1. Patch data types for CrateDB dialect.
        # TODO: Upstream to `sqlalchemy-cratedb`.
        patch_types_map()

        # 2. Prepare pandas.
        # TODO: Provide unpatching hook.
        # TODO: Use `with table_kwargs(...)`.
        from cratedb_toolkit.util.pandas import patch_pandas_sqltable_with_dialect_parameters

        patch_pandas_sqltable_with_dialect_parameters(table_kwargs={"crate_column_policy": "'dynamic'"})
        patch_pandas_sqltable_with_extended_mapping()

    def activate(self):
        """
        Swap in the MongoDB -> CrateDB adapter, by patching functions in PyMongo.

        If one of the patches fails to start, the patches that were already
        started are stopped again (in reverse order) before the exception is
        re-raised, so PyMongo is not left in a half-patched state.
        """
        started = []
        try:
            for patch_ in self.patches:
                patch_.start()
                started.append(patch_)
        except Exception:
            for patch_ in reversed(started):
                patch_.stop()
            raise

    def deactivate(self):
        """
        Swap out the MongoDB -> CrateDB adapter, by restoring patched functions.
        """
        for patch_ in self.patches:
            patch_.stop()
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
# Make Python 3.7 and 3.8 support generic types like `dict` instead of `typing.Dict`.
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import io
|
|
5
|
+
import logging
|
|
6
|
+
from collections import abc
|
|
7
|
+
from typing import Any, Iterable, Iterator, Mapping, Optional, Union
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
from bson.raw_bson import RawBSONDocument
|
|
11
|
+
from pymongo import common
|
|
12
|
+
from pymongo.client_session import ClientSession
|
|
13
|
+
from pymongo.collection import Collection
|
|
14
|
+
from pymongo.cursor import Cursor
|
|
15
|
+
from pymongo.results import InsertManyResult, InsertOneResult
|
|
16
|
+
from pymongo.typings import _DocumentType
|
|
17
|
+
from sqlalchemy_cratedb.support import insert_bulk
|
|
18
|
+
|
|
19
|
+
from cratedb_toolkit.adapter.pymongo.cursor import cursor_factory
|
|
20
|
+
from cratedb_toolkit.adapter.pymongo.util import AmendedObjectId as ObjectId
|
|
21
|
+
from cratedb_toolkit.util.database import DatabaseAdapter
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def collection_factory(cratedb: DatabaseAdapter):
    """
    Build a PyMongo `Collection` subclass bound to the given database adapter.

    :param cratedb: Adapter whose `engine` is used for all writes; it is also
                    handed to `cursor_factory` for reads.
    :return: The `AmendedCollection` class (not an instance).
    """

    class AmendedCollection(Collection):
        """`Collection` variant whose find/count/insert operations use `cratedb`."""

        def find(self: Collection, *args: Any, **kwargs: Any) -> Cursor[_DocumentType]:
            """Return a cursor created by `cursor_factory`, forwarding all arguments."""
            AmendedCursor = cursor_factory(cratedb=cratedb)
            return AmendedCursor(self, *args, **kwargs)

        def count_documents(
            self: Collection,
            filter: Mapping[str, Any],  # noqa: A002
            session: Optional[ClientSession] = None,
            comment: Optional[Any] = None,
            **kwargs: Any,
        ) -> int:
            """
            Count matching documents by running `find` and counting its results.

            TODO: Make it more efficient.
            """
            filter = filter or {}  # noqa: A001
            return len(list(self.find(filter=filter, session=session, comment=comment, **kwargs)))

        @staticmethod
        def get_df_info(df: pd.DataFrame) -> str:
            """Return the text that `df.info()` writes, as a string."""
            buffer = io.StringIO()
            df.info(buf=buffer)
            buffer.seek(0)
            return buffer.read()

        def insert_one(
            self: Collection,
            document: Union[_DocumentType, RawBSONDocument],
            bypass_document_validation: bool = False,
            session: Optional[ClientSession] = None,
            comment: Optional[Any] = None,
        ) -> InsertOneResult:
            """
            Insert a single document through pandas' `to_sql()`.

            The insert statement is amended with `RETURNING _id`; the returned
            value is converted with `ObjectId.from_str` and reported as
            `inserted_id`.

            :raises ValueError: When no record identifier was returned.
            """
            # Lazy %-args: the document is only rendered when debug logging is on.
            logger.debug("Reading document: %s", document)
            data = pd.DataFrame.from_records([document])
            # logger.debug(f"Dataframe: {self.get_df_info(data)}, {data.tail()}")  # noqa: ERA001
            logger.debug(f"Inserting record into CrateDB: schema={self.database.name}, table={self.name}")

            object_id_cratedb: Optional[str] = None

            def insert_returning_id(pd_table, conn, keys, data_iter):
                """
                Use CrateDB's "bulk operations" endpoint as a fast path for pandas' and Dask's `to_sql()` [1] method.

                The idea is to break out of SQLAlchemy, compile the insert statement, and use the raw
                DBAPI connection client, in order to be able to amend the SQL statement, adding a
                `RETURNING _id` clause.

                The vanilla implementation, used by SQLAlchemy, is::

                    data = [dict(zip(keys, row)) for row in data_iter]
                    conn.execute(pd_table.table.insert(), data)
                """
                nonlocal object_id_cratedb

                # Compile SQL statement and materialize batch.
                sql = str(pd_table.table.insert().compile(bind=conn))
                rows = list(data_iter)

                # Invoke amended insert operation, returning the record
                # identifier as surrogate to MongoDB's `ObjectId`.
                cursor = conn._dbapi_connection.cursor()
                try:
                    cursor.execute(sql=sql + " RETURNING _id", parameters=rows[0])
                    outcome = cursor.fetchone()
                finally:
                    # Always release the raw DBAPI cursor, also when the insert fails.
                    cursor.close()

                # Without a returned row, leave the identifier unset, so the
                # caller raises its explicit `ValueError` below.
                if outcome:
                    object_id_cratedb = outcome[0]

            # TODO: Either, or?
            data.to_sql(
                name=self.name,
                schema=self.database.name,
                con=cratedb.engine,
                index=False,
                # TODO: Handle `append` vs. `replace`.
                if_exists="append",
                method=insert_returning_id,
            )

            if object_id_cratedb is None:
                raise ValueError("Object may have been created, but there is no object id")

            object_id_mongodb = ObjectId.from_str(object_id_cratedb)
            logger.debug(f"Created object with id: {object_id_mongodb!r}")
            return InsertOneResult(inserted_id=object_id_mongodb, acknowledged=True)

        def insert_many(
            self,
            documents: Iterable[Union[_DocumentType, RawBSONDocument]],
            ordered: bool = True,
            bypass_document_validation: bool = False,
            session: Optional[ClientSession] = None,
            comment: Optional[Any] = None,
        ) -> InsertManyResult:
            """
            Insert multiple documents through pandas' `to_sql()` with `insert_bulk`.

            :raises TypeError: When `documents` is not iterable, is a mapping, or is falsy.
            """
            if not isinstance(documents, abc.Iterable) or isinstance(documents, abc.Mapping) or not documents:
                raise TypeError("documents must be a non-empty list")
            inserted_ids: list[ObjectId] = []

            def gen() -> Iterator[Mapping[str, Any]]:
                """A generator that validates documents and handles _ids."""
                # NOTE(review): The nesting below was reconstructed from a
                # flattened listing; confirm it against the upstream file.
                for document in documents:
                    common.validate_is_document_type("document", document)
                    if not isinstance(document, RawBSONDocument):
                        if "_id" in document:
                            identifier = ObjectId(document["_id"])
                        else:
                            identifier = ObjectId()
                            document["_id"] = str(identifier)  # type: ignore[index]
                        inserted_ids.append(identifier)
                    yield document

            logger.debug("Converting documents")
            documents_real = list(gen())

            # Lazy %-args: the batch is only rendered when debug logging is on.
            logger.debug("Reading documents: %s", documents_real)
            data = pd.DataFrame.from_records(documents_real)
            # Summarizing the dataframe is comparatively costly; skip it
            # entirely unless debug output is requested.
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f"Dataframe: {self.get_df_info(data)}, {data.tail()}")  # noqa: ERA001
            logger.debug(f"Inserting records into CrateDB: schema={self.database.name}, table={self.name}")

            data.to_sql(
                name=self.name,
                schema=self.database.name,
                con=cratedb.engine,
                index=False,
                # TODO: Handle `append` vs. `replace`.
                if_exists="append",
                method=insert_bulk,
            )

            return InsertManyResult(inserted_ids, acknowledged=True)

    return AmendedCollection
|
|
@@ -0,0 +1,390 @@
|
|
|
1
|
+
# Make Python 3.7 and 3.8 support generic types like `dict` instead of `typing.Dict`.
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import copy
|
|
5
|
+
import logging
|
|
6
|
+
import warnings
|
|
7
|
+
from collections import deque
|
|
8
|
+
from typing import Any, Iterable, Mapping, Optional, Union
|
|
9
|
+
|
|
10
|
+
import sqlalchemy as sa
|
|
11
|
+
from bson import SON
|
|
12
|
+
from pymongo import CursorType, helpers
|
|
13
|
+
from pymongo.client_session import ClientSession
|
|
14
|
+
from pymongo.collation import validate_collation_or_none
|
|
15
|
+
from pymongo.collection import Collection
|
|
16
|
+
from pymongo.common import validate_is_document_type, validate_is_mapping
|
|
17
|
+
from pymongo.cursor import _QUERY_OPTIONS, Cursor, _Hint, _Sort
|
|
18
|
+
from pymongo.errors import InvalidOperation
|
|
19
|
+
from pymongo.message import _GetMore, _Query
|
|
20
|
+
from pymongo.read_preferences import _ServerMode
|
|
21
|
+
from pymongo.typings import _Address, _CollationIn, _DocumentType
|
|
22
|
+
from pymongo.write_concern import validate_boolean
|
|
23
|
+
|
|
24
|
+
from cratedb_toolkit.adapter.pymongo.reactor import mongodb_query, table_to_model
|
|
25
|
+
from cratedb_toolkit.adapter.pymongo.util import AmendedObjectId
|
|
26
|
+
from cratedb_toolkit.util.database import DatabaseAdapter
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def cursor_factory(cratedb: DatabaseAdapter):
    """
    Build a PyMongo `Cursor` subclass bound to the given database adapter.

    :param cratedb: Adapter whose `engine` and `connection` are used by the
                    cursor when it fetches records.
    :return: The `AmendedCursor` class (not an instance).
    """

    class AmendedCursor(Cursor[_DocumentType]):
        """
        `Cursor` variant whose private `__send_message` runs an SQL query
        through `cratedb` and fills the cursor's buffer from the result.

        Attributes written as `self.__name` in this class are name-mangled to
        `_AmendedCursor__name`; `_synthesize()` mirrors them to the
        `_Cursor__name` names.
        """

        _query_class = _Query
        _getmore_class = _GetMore

        def __init__(
            self,
            collection: Collection[_DocumentType],
            filter: Optional[Mapping[str, Any]] = None,  # noqa: A002
            projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
            skip: int = 0,
            limit: int = 0,
            no_cursor_timeout: bool = False,
            cursor_type: int = CursorType.NON_TAILABLE,
            sort: Optional[_Sort] = None,
            allow_partial_results: bool = False,
            oplog_replay: bool = False,
            batch_size: int = 0,
            collation: Optional[_CollationIn] = None,
            hint: Optional[_Hint] = None,
            max_scan: Optional[int] = None,
            max_time_ms: Optional[int] = None,
            max: Optional[_Sort] = None,  # noqa: A002
            min: Optional[_Sort] = None,  # noqa: A002
            return_key: Optional[bool] = None,
            show_record_id: Optional[bool] = None,
            snapshot: Optional[bool] = None,
            comment: Optional[Any] = None,
            session: Optional[ClientSession] = None,
            allow_disk_use: Optional[bool] = None,
            let: Optional[bool] = None,
        ) -> None:
            """Create a new cursor.

            Should not be called directly by application developers - see
            :meth:`~pymongo.collection.Collection.find` instead.

            .. seealso:: The MongoDB documentation on `cursors <https://dochub.mongodb.org/core/cursors>`_.
            """
            # Initialize all attributes used in __del__ before possibly raising
            # an error to avoid attribute errors during garbage collection.
            self.__collection: Collection[_DocumentType] = collection
            self.__id: Any = None
            self.__exhaust = False
            self.__sock_mgr: Any = None
            self.__killed = False
            self.__session: Optional[ClientSession]

            # Remember whether the session was supplied by the caller.
            if session:
                self.__session = session
                self.__explicit_session = True
            else:
                self.__session = None
                self.__explicit_session = False

            # Validate the arguments before storing them.
            spec: Mapping[str, Any] = filter or {}
            validate_is_mapping("filter", spec)
            if not isinstance(skip, int):
                raise TypeError("skip must be an instance of int")
            if not isinstance(limit, int):
                raise TypeError("limit must be an instance of int")
            validate_boolean("no_cursor_timeout", no_cursor_timeout)
            if no_cursor_timeout and not self.__explicit_session:
                warnings.warn(
                    "use an explicit session with no_cursor_timeout=True "
                    "otherwise the cursor may still timeout after "
                    "30 minutes, for more info see "
                    "https://mongodb.com/docs/v4.4/reference/method/"
                    "cursor.noCursorTimeout/"
                    "#session-idle-timeout-overrides-nocursortimeout",
                    UserWarning,
                    stacklevel=2,
                )
            if cursor_type not in (
                CursorType.NON_TAILABLE,
                CursorType.TAILABLE,
                CursorType.TAILABLE_AWAIT,
                CursorType.EXHAUST,
            ):
                raise ValueError("not a valid value for cursor_type")
            validate_boolean("allow_partial_results", allow_partial_results)
            validate_boolean("oplog_replay", oplog_replay)
            if not isinstance(batch_size, int):
                raise TypeError("batch_size must be an integer")
            if batch_size < 0:
                raise ValueError("batch_size must be >= 0")
            # Only set if allow_disk_use is provided by the user, else None.
            if allow_disk_use is not None:
                allow_disk_use = validate_boolean("allow_disk_use", allow_disk_use)

            if projection is not None:
                projection = helpers._fields_list_to_dict(projection, "projection")

            if let is not None:
                validate_is_document_type("let", let)

            self.__let = let
            self.__spec = spec
            self.__has_filter = filter is not None
            self.__projection = projection
            self.__skip = skip
            self.__limit = limit
            self.__batch_size = batch_size
            self.__ordering = sort and helpers._index_document(sort) or None
            self.__max_scan = max_scan
            self.__explain = False
            self.__comment = comment
            self.__max_time_ms = max_time_ms
            self.__max_await_time_ms: Optional[int] = None
            self.__max: Optional[Union[SON[Any, Any], _Sort]] = max
            self.__min: Optional[Union[SON[Any, Any], _Sort]] = min
            self.__collation = validate_collation_or_none(collation)
            self.__return_key = return_key
            self.__show_record_id = show_record_id
            self.__allow_disk_use = allow_disk_use
            self.__snapshot = snapshot
            self.__hint: Union[str, SON[str, Any], None]
            self.__set_hint(hint)

            # Exhaust cursor support
            # TODO: Implement.
            # The string literal below is disabled code kept for reference;
            # it is never executed, so `self.__exhaust` stays `False`.
            """
            if cursor_type == CursorType.EXHAUST:
                if self.__collection.database.client.is_mongos:
                    raise InvalidOperation("Exhaust cursors are not supported by mongos")
                if limit:
                    raise InvalidOperation("Can't use limit and exhaust together.")
                self.__exhaust = True
            """

            # This is ugly. People want to be able to do cursor[5:5] and
            # get an empty result set (old behavior was an
            # exception). It's hard to do that right, though, because the
            # server uses limit(0) to mean 'no limit'. So we set __empty
            # in that case and check for it when iterating. We also unset
            # it anytime we change __limit.
            self.__empty = False

            self.__data: deque = deque()
            self.__address: Optional[_Address] = None
            self.__retrieved = 0

            self.__codec_options = collection.codec_options
            # Read preference is set when the initial find is sent.
            self.__read_preference: Optional[_ServerMode] = None
            self.__read_concern = collection.read_concern

            self.__query_flags = cursor_type
            if no_cursor_timeout:
                self.__query_flags |= _QUERY_OPTIONS["no_timeout"]
            if allow_partial_results:
                self.__query_flags |= _QUERY_OPTIONS["partial"]
            if oplog_replay:
                self.__query_flags |= _QUERY_OPTIONS["oplog_replay"]

            # The namespace to use for find/getMore commands.
            self.__dbname = collection.database.name
            self.__collname = collection.name

            # Hack back the inheritance into the parent class.
            self._synthesize()

        def _synthesize(self):
            """
            Copy every instance attribute named `_AmendedCursor__*` to the
            corresponding `_Cursor__*` attribute name.
            """
            # Hack back the inheritance into the parent class, in order to save code.
            # Otherwise, it will yield errors like `AttributeError: 'AmendedCursor'
            # object has no attribute '_Cursor__explicit_session'`
            attrs = self.__dict__
            # Iterate over a snapshot of the keys, because `setattr` below
            # adds new entries to the instance dictionary.
            for name in list(attrs.keys()):
                if not name.startswith("_AmendedCursor"):
                    continue
                parent_name = name.replace("_AmendedCursor__", "_Cursor__")
                setattr(self, parent_name, getattr(self, name))

        def update_parent(self):
            """Make `_Cursor__data` refer to this class's current data buffer."""
            self._Cursor__data = self.__data

        def next(self) -> _DocumentType:  # noqa: A002, A003
            """Advance the cursor."""
            if self.__empty:
                raise StopIteration
            # Serve from the buffer; refresh it only when it is empty.
            if len(self.__data) or self._refresh():
                return self.__data.popleft()
            else:
                raise StopIteration

        __next__ = next

        def __enter__(self) -> Cursor[_DocumentType]:
            """Return the cursor itself for use in a `with` statement."""
            return self

        def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
            """Close the cursor when leaving the `with` block."""
            self.close()

        def _refresh(self) -> int:
            """Refreshes the cursor with more data from Mongo.

            Returns the length of self.__data after refresh. Will exit early if
            self.__data is already non-empty. Raises OperationFailure when the
            cursor cannot be refreshed due to an error on the query.
            """
            if len(self.__data) or self.__killed:
                return len(self.__data)

            if not self.__session:
                self.__session = self.__collection.database.client._ensure_session()

            if self.__id is None:  # Query
                if (self.__min or self.__max) and not self.__hint:
                    raise InvalidOperation(
                        "Passing a 'hint' is required when using the min/max query"
                        " option to ensure the query utilizes the correct index"
                    )
                q = self._query_class(
                    self.__query_flags,
                    self.__collection.database.name,
                    self.__collection.name,
                    self.__skip,
                    self.__query_spec(),
                    self.__projection,
                    self.__codec_options,
                    self._read_preference(),
                    self.__limit,
                    self.__batch_size,
                    self.__read_concern,
                    self.__collation,
                    self.__session,
                    self.__collection.database.client,
                    self.__allow_disk_use,
                    self.__exhaust,
                )
                self.__send_message(q)
            # `__send_message` sets `self.__id` to 0, so this branch is only
            # taken when the id holds some other, truthy value.
            elif self.__id:  # Get More
                if self.__limit:
                    limit = self.__limit - self.__retrieved
                    if self.__batch_size:
                        limit = min(limit, self.__batch_size)
                else:
                    limit = self.__batch_size
                # Exhaust cursors don't send getMore messages.
                g = self._getmore_class(
                    self.__dbname,
                    self.__collname,
                    limit,
                    self.__id,
                    self.__codec_options,
                    self._read_preference(),
                    self.__session,
                    self.__collection.database.client,
                    self.__max_await_time_ms,
                    self.__sock_mgr,
                    self.__exhaust,
                    self.__comment,
                )
                self.__send_message(g)

            return len(self.__data)

        def sort(self, key_or_list: _Hint, direction: Optional[Union[int, str]] = None) -> Cursor[_DocumentType]:
            """Store the sort specification on this cursor and return the cursor."""
            keys = helpers._index_list(key_or_list, direction)
            self.__ordering = helpers._index_document(keys)
            return self

        def __send_message(self, operation: Union[_Query, _GetMore]) -> None:
            """
            Usually sends a query or getmore operation and handles the response to/from a MongoDB server.
            Here, it will build an SQL query from the `operation`s metadata, and will have a conversation
            with a CrateDB server instead.

            TODO: OperationFailure / self.close() / PinnedResponse / explain / batching
            """
            # The operation's database name is used as schema, its collection
            # name as table name.
            metadata = sa.MetaData(schema=operation.db)
            table_name = operation.coll

            # Reflect the table definition from the database, and add the
            # `_id` column to it explicitly.
            table = sa.Table(table_name, metadata, autoload_with=cratedb.engine)
            table.append_column(sa.Column("_id", sa.String(), primary_key=True, system=True))
            model = table_to_model(table)

            # NOTE(review): Only the filter and the sort keys are handed over
            # here; skip, limit and projection stored on the cursor are not.
            query = mongodb_query(
                model=model,
                filter=dict(self.__spec) or {},
                sort=self.__ordering and list(self.__ordering) or ["_id"],
            )
            records = query.fetchall(cratedb.connection)
            # Convert each record's `_id` value using `AmendedObjectId.from_str`.
            for record in records:
                record["_id"] = AmendedObjectId.from_str(record["_id"])
            self.__data = deque(records)
            self.__retrieved += len(records)
            # With the id no longer `None` (and falsy), `_refresh` issues
            # neither another query nor a getMore for this cursor.
            self.__id = 0

            # Needed when manipulating `self.__data`, to synchronize
            # with the `Cursor` parent class.
            self.update_parent()

        def __query_spec(self) -> Mapping[str, Any]:
            """Get the spec to use for a query."""
            # Collect the query modifiers that have been set on this cursor.
            operators: dict[str, Any] = {}
            if self.__ordering:
                operators["$orderby"] = self.__ordering
            if self.__explain:
                operators["$explain"] = True
            if self.__hint:
                operators["$hint"] = self.__hint
            if self.__let:
                operators["let"] = self.__let
            if self.__comment:
                operators["$comment"] = self.__comment
            if self.__max_scan:
                operators["$maxScan"] = self.__max_scan
            if self.__max_time_ms is not None:
                operators["$maxTimeMS"] = self.__max_time_ms
            if self.__max:
                operators["$max"] = self.__max
            if self.__min:
                operators["$min"] = self.__min
            if self.__return_key is not None:
                operators["$returnKey"] = self.__return_key
            if self.__show_record_id is not None:
                # This is upgraded to showRecordId for MongoDB 3.2+ "find" command.
                operators["$showDiskLoc"] = self.__show_record_id
            if self.__snapshot is not None:
                operators["$snapshot"] = self.__snapshot

            if operators:
                # Make a shallow copy so we can cleanly rewind or clone.
                spec = copy.copy(self.__spec)

                # Allow-listed commands must be wrapped in $query.
                if "$query" not in spec:
                    # $query has to come first
                    spec = SON([("$query", spec)])

                if not isinstance(spec, SON):
                    # Ensure the spec is SON. As order is important this will
                    # ensure its set before merging in any extra operators.
                    spec = SON(spec)

                spec.update(operators)
                return spec
            # Have to wrap with $query if "query" is the first key.
            # We can't just use $query anytime "query" is a key as
            # that breaks commands like count and find_and_modify.
            # Checking spec.keys()[0] covers the case that the spec
            # was passed as an instance of SON or OrderedDict.
            elif "query" in self.__spec and (len(self.__spec) == 1 or next(iter(self.__spec)) == "query"):
                return SON({"$query": self.__spec})

            return self.__spec

        def __set_hint(self, index: Optional[_Hint]) -> None:
            """
            Store the index hint: `None` and strings are kept as they are,
            any other value is converted to a `SON` index document.
            """
            if index is None:
                self.__hint = None
                return

            if isinstance(index, str):
                self.__hint = index
            else:
                self.__hint = SON(helpers._index_document(index))

    return AmendedCursor
|