ingestr 0.1.4__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ingestr might be problematic.
- {ingestr-0.1.4 → ingestr-0.2.1}/PKG-INFO +3 -1
- {ingestr-0.1.4 → ingestr-0.2.1}/README.md +1 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/docs/.vitepress/config.mjs +2 -1
- ingestr-0.2.1/docs/supported-sources/mongodb.md +24 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/overview.md +1 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/src/factory.py +3 -1
- ingestr-0.2.1/ingestr/src/factory_test.py +13 -0
- ingestr-0.2.1/ingestr/src/mongodb/__init__.py +103 -0
- ingestr-0.2.1/ingestr/src/mongodb/helpers.py +166 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/src/sources.py +38 -1
- ingestr-0.2.1/ingestr/src/sources_test.py +102 -0
- ingestr-0.2.1/ingestr/src/version.py +1 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/testdata/test_create_replace.db +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/testdata/test_delete_insert_with_timerange.db +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/testdata/test_delete_insert_without_primary_key.db +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/testdata/test_merge_with_primary_key.db +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/requirements.txt +1 -0
- ingestr-0.1.4/ingestr/src/sources_test.py +0 -54
- ingestr-0.1.4/ingestr/src/version.py +0 -1
- {ingestr-0.1.4 → ingestr-0.2.1}/.dockerignore +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/.github/workflows/deploy-docs.yml +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/.github/workflows/docker.yml +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/.gitignore +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/Dockerfile +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/LICENSE.md +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/Makefile +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/docs/.vitepress/theme/custom.css +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/docs/.vitepress/theme/index.js +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/docs/commands/example-uris.md +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/docs/commands/ingest.md +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/docs/getting-started/core-concepts.md +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/docs/getting-started/incremental-loading.md +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/docs/getting-started/quickstart.md +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/docs/getting-started/telemetry.md +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/docs/index.md +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/bigquery.md +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/csv.md +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/databricks.md +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/duckdb.md +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/mssql.md +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/mysql.md +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/oracle.md +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/postgres.md +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/redshift.md +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/snowflake.md +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/sqlite.md +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/main.py +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/main_test.py +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/src/destinations.py +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/src/destinations_test.py +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/src/sql_database/__init__.py +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/src/sql_database/helpers.py +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/src/sql_database/schema_types.py +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/src/sql_database/settings.py +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/src/telemetry/event.py +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/src/testdata/fakebqcredentials.json +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/testdata/.gitignore +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/package-lock.json +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/package.json +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/pyproject.toml +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/requirements-dev.txt +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/resources/demo.gif +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/resources/demo.tape +0 -0
- {ingestr-0.1.4 → ingestr-0.2.1}/resources/ingestr.svg +0 -0
```diff
--- ingestr-0.1.4/PKG-INFO
+++ ingestr-0.2.1/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ingestr
-Version: 0.1.4
+Version: 0.2.1
 Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
 Project-URL: Homepage, https://github.com/bruin-data/ingestr
 Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -23,6 +23,7 @@ Requires-Dist: google-cloud-bigquery-storage==2.24.0
 Requires-Dist: pendulum==3.0.0
 Requires-Dist: psycopg2-binary==2.9.9
 Requires-Dist: py-machineid==0.5.1
+Requires-Dist: pymongo==4.6.2
 Requires-Dist: pymysql==1.1.0
 Requires-Dist: pyodbc==5.1.0
 Requires-Dist: redshift-connector==2.1.0
@@ -100,6 +101,7 @@ Join our Slack community [here](https://join.slack.com/t/bruindatacommunity/shar
 | DuckDB | ✅ | ✅ |
 | Microsoft SQL Server | ✅ | ✅ |
 | Local CSV file | ✅ | ✅ |
+| MongoDB | ✅ | ❌ |
 | Oracle | ✅ | ❌ |
 | SQLite | ✅ | ❌ |
 | MySQL | ✅ | ❌ |
```
```diff
--- ingestr-0.1.4/docs/.vitepress/config.mjs
+++ ingestr-0.2.1/docs/.vitepress/config.mjs
@@ -46,15 +46,16 @@ export default defineConfig({
     {
       text: "Sources & Destinations",
       items: [
-        { text: "Overview", link: "/supported-sources/overview.md" },
         { text: "AWS Redshift", link: "/supported-sources/redshift.md" },
         { text: "Databricks", link: "/supported-sources/databricks.md" },
         { text: "DuckDB", link: "/supported-sources/duckdb.md" },
         { text: "Google BigQuery", link: "/supported-sources/bigquery.md" },
         { text: "Local CSV Files", link: "/supported-sources/csv.md" },
         { text: "Microsoft SQL Server", link: "/supported-sources/mssql.md" },
+        { text: "MongoDB", link: "/supported-sources/mongodb.md" },
         { text: "MySQL", link: "/supported-sources/mysql.md" },
         { text: "Oracle", link: "/supported-sources/oracle.md" },
+        { text: "Overview", link: "/supported-sources/overview.md" },
         { text: "Postgres", link: "/supported-sources/postgres.md" },
         { text: "Snowflake", link: "/supported-sources/snowflake.md" },
         { text: "SQLite", link: "/supported-sources/sqlite.md" },
```
````diff
--- /dev/null
+++ ingestr-0.2.1/docs/supported-sources/mongodb.md
@@ -0,0 +1,24 @@
+# MongoDB
+MongoDB is a popular, open-source NoSQL database known for its flexibility, scalability, and wide adoption in a variety of applications.
+
+ingestr supports MongoDB as a source.
+
+## URI Format
+The URI format for MongoDB is as follows:
+
+```plaintext
+mongodb://user:password@host:port
+```
+
+URI parameters:
+- `user`: the user name to connect to the database
+- `password`: the password for the user
+- `host`: the host address of the database server
+- `port`: the port number the database server is listening on (default is 27017 for MongoDB)
+
+
+> [!CAUTION]
+> Do not put the database name at the end of the URI for MongoDB; instead, make it part of the `--source-table` option, in `database.collection` format.
+
+
+You can read more about MongoDB's connection string format [here](https://docs.mongodb.com/manual/reference/connection-string/).
````
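The URI above is a standard MongoDB connection string, so it can be sanity-checked directly with pymongo, the driver this release pins as a dependency. A minimal sketch, assuming a reachable server; the credentials, host, database, and collection names below are placeholders, not values from the release:

```python
# Sketch only: verify a connection string of the documented shape with pymongo.
# Note that the database name is NOT part of the URI; it comes from --source-table.
from pymongo import MongoClient

uri = "mongodb://user:password@localhost:27017"  # placeholder credentials and host
client = MongoClient(uri, tz_aware=True)

# "mydb.users" is what ingestr would expect as --source-table ("database.collection").
print(client["mydb"]["users"].estimated_document_count())
```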
```diff
--- ingestr-0.1.4/ingestr/src/factory.py
+++ ingestr-0.2.1/ingestr/src/factory.py
@@ -13,7 +13,7 @@ from ingestr.src.destinations import (
     RedshiftDestination,
     SnowflakeDestination,
 )
-from ingestr.src.sources import LocalCsvSource, SqlSource
+from ingestr.src.sources import LocalCsvSource, MongoDbSource, SqlSource
 
 SQL_SOURCE_SCHEMES = [
     "bigquery",
@@ -77,6 +77,8 @@ class SourceDestinationFactory:
            return SqlSource()
        elif self.source_scheme == "csv":
            return LocalCsvSource()
+       elif self.source_scheme == "mongodb":
+           return MongoDbSource()
        else:
            raise ValueError(f"Unsupported source scheme: {self.source_scheme}")
 
```
```diff
--- /dev/null
+++ ingestr-0.2.1/ingestr/src/factory_test.py
@@ -0,0 +1,13 @@
+from ingestr.src.factory import parse_scheme_from_uri
+
+
+def test_scheme_is_parsed_from_uri_correctly():
+    assert parse_scheme_from_uri("bigquery://my-project") == "bigquery"
+    assert parse_scheme_from_uri("http://localhost:8080") == "http"
+    assert parse_scheme_from_uri("file:///tmp/myfile") == "file"
+    assert parse_scheme_from_uri("https://example.com?query=123") == "https"
+    assert parse_scheme_from_uri("ftp://ftp.example.com/downloads/file.zip") == "ftp"
+    assert (
+        parse_scheme_from_uri("redshift+psycopg2://user:pw@host") == "redshift+psycopg2"
+    )
+    assert parse_scheme_from_uri("mysql+pymysql://user:pw@host") == "mysql+pymysql"
```
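The test above only pins down the expected behaviour; the implementation of `parse_scheme_from_uri` is not shown in this diff. A minimal sketch that satisfies the same assertions, assuming a plain urllib-based approach (the real function in `ingestr/src/factory.py` may differ):

```python
# Sketch only: one way to get the behaviour the test asserts.
from urllib.parse import urlparse


def parse_scheme_from_uri_sketch(uri: str) -> str:
    # urlparse keeps compound schemes such as "redshift+psycopg2" intact
    return urlparse(uri).scheme


assert parse_scheme_from_uri_sketch("bigquery://my-project") == "bigquery"
assert parse_scheme_from_uri_sketch("redshift+psycopg2://user:pw@host") == "redshift+psycopg2"
```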
```diff
--- /dev/null
+++ ingestr-0.2.1/ingestr/src/mongodb/__init__.py
@@ -0,0 +1,103 @@
+"""Source that loads collections from a mongo database, supports incremental loads."""
+
+from typing import Any, Iterable, List, Optional
+
+import dlt
+from dlt.sources import DltResource
+
+from .helpers import (
+    MongoDbCollectionConfiguration,
+    MongoDbCollectionResourceConfiguration,
+    client_from_credentials,
+    collection_documents,
+)
+
+
+@dlt.source
+def mongodb(
+    connection_url: str = dlt.secrets.value,
+    database: Optional[str] = dlt.config.value,
+    collection_names: Optional[List[str]] = dlt.config.value,
+    incremental: Optional[dlt.sources.incremental] = None,  # type: ignore[type-arg]
+    write_disposition: Optional[str] = dlt.config.value,
+    parallel: Optional[bool] = dlt.config.value,
+) -> Iterable[DltResource]:
+    """
+    A DLT source which loads data from a mongo database using PyMongo.
+    Resources are automatically created for each collection in the database or from the given list of collections.
+
+    Args:
+        connection_url (str): Database connection_url.
+        database (Optional[str]): Selected database name, it will use the default database if not passed.
+        collection_names (Optional[List[str]]): The list of collections `pymongo.collection.Collection` to load.
+        incremental (Optional[dlt.sources.incremental]): Option to enable incremental loading for the collection.
+            E.g., `incremental=dlt.sources.incremental('updated_at', pendulum.parse('2022-01-01T00:00:00Z'))`
+        write_disposition (str): Write disposition of the resource.
+        parallel (Optional[bool]): Option to enable parallel loading for the collection. Default is False.
+    Returns:
+        Iterable[DltResource]: A list of DLT resources for each collection to be loaded.
+    """
+
+    # set up mongo client
+    client = client_from_credentials(connection_url)
+    if not database:
+        mongo_database = client.get_default_database()
+    else:
+        mongo_database = client[database]
+
+    # use the provided collections or all collections
+    if not collection_names:
+        collection_names = mongo_database.list_collection_names()
+
+    collection_list = [mongo_database[collection] for collection in collection_names]
+
+    for collection in collection_list:
+        yield dlt.resource(  # type: ignore
+            collection_documents,
+            name=collection.name,
+            primary_key="_id",
+            write_disposition=write_disposition,
+            spec=MongoDbCollectionConfiguration,
+        )(client, collection, incremental=incremental, parallel=parallel)
+
+
+@dlt.common.configuration.with_config(
+    sections=("sources", "mongodb"), spec=MongoDbCollectionResourceConfiguration
+)
+def mongodb_collection(
+    connection_url: str = dlt.secrets.value,
+    database: Optional[str] = dlt.config.value,
+    collection: str = dlt.config.value,
+    incremental: Optional[dlt.sources.incremental] = None,  # type: ignore[type-arg]
+    write_disposition: Optional[str] = dlt.config.value,
+    parallel: Optional[bool] = dlt.config.value,
+) -> Any:
+    """
+    A DLT source which loads a collection from a mongo database using PyMongo.
+
+    Args:
+        connection_url (str): Database connection_url.
+        database (Optional[str]): Selected database name, it will use the default database if not passed.
+        collection (str): The collection name to load.
+        incremental (Optional[dlt.sources.incremental]): Option to enable incremental loading for the collection.
+            E.g., `incremental=dlt.sources.incremental('updated_at', pendulum.parse('2022-01-01T00:00:00Z'))`
+        write_disposition (str): Write disposition of the resource.
+        parallel (Optional[bool]): Option to enable parallel loading for the collection. Default is False.
+    Returns:
+        Iterable[DltResource]: A list of DLT resources for each collection to be loaded.
+    """
+    # set up mongo client
+    client = client_from_credentials(connection_url)
+    if not database:
+        mongo_database = client.get_default_database()
+    else:
+        mongo_database = client[database]
+
+    collection_obj = mongo_database[collection]
+
+    return dlt.resource(  # type: ignore
+        collection_documents,
+        name=collection_obj.name,
+        primary_key="_id",
+        write_disposition=write_disposition,
+    )(client, collection_obj, incremental=incremental, parallel=parallel)
```
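For context, `mongodb_collection` returns a regular dlt resource, so it can also be run in a standalone dlt pipeline. A minimal sketch, assuming placeholder connection details and a local DuckDB destination; ingestr itself wires this up through `MongoDbSource` rather than by hand:

```python
# Sketch only: run the collection resource directly with dlt.
# The connection URL, database, and collection values are placeholders.
import dlt

from ingestr.src.mongodb import mongodb_collection

pipeline = dlt.pipeline(pipeline_name="mongo_demo", destination="duckdb", dataset_name="raw")

users = mongodb_collection(
    connection_url="mongodb://user:password@localhost:27017",
    database="mydb",
    collection="users",
    parallel=False,
)

info = pipeline.run(users, write_disposition="replace")
print(info)
```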
```diff
--- /dev/null
+++ ingestr-0.2.1/ingestr/src/mongodb/helpers.py
@@ -0,0 +1,166 @@
+"""Mongo database source helpers"""
+
+from itertools import islice
+from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Tuple
+
+import dlt
+from bson.decimal128 import Decimal128
+from bson.objectid import ObjectId
+from dlt.common.configuration.specs import BaseConfiguration, configspec
+from dlt.common.time import ensure_pendulum_datetime
+from dlt.common.typing import TDataItem
+from dlt.common.utils import map_nested_in_place
+from pendulum import _datetime
+from pymongo import ASCENDING, DESCENDING, MongoClient
+from pymongo.collection import Collection
+from pymongo.cursor import Cursor
+
+if TYPE_CHECKING:
+    TMongoClient = MongoClient[Any]
+    TCollection = Collection[Any]  # type: ignore
+    TCursor = Cursor[Any]
+else:
+    TMongoClient = Any
+    TCollection = Any
+    TCursor = Any
+
+CHUNK_SIZE = 10000
+
+
+class CollectionLoader:
+    def __init__(
+        self,
+        client: TMongoClient,
+        collection: TCollection,
+        incremental: Optional[dlt.sources.incremental[Any]] = None,
+    ) -> None:
+        self.client = client
+        self.collection = collection
+        self.incremental = incremental
+        if incremental:
+            self.cursor_field = incremental.cursor_path
+            self.last_value = incremental.last_value
+        else:
+            self.cursor_column = None
+            self.last_value = None
+
+    @property
+    def _filter_op(self) -> Dict[str, Any]:
+        if not self.incremental or not self.last_value:
+            return {}
+        if self.incremental.last_value_func is max:
+            return {self.cursor_field: {"$gte": self.last_value}}
+        elif self.incremental.last_value_func is min:
+            return {self.cursor_field: {"$lt": self.last_value}}
+        return {}
+
+    def load_documents(self) -> Iterator[TDataItem]:
+        cursor = self.collection.find(self._filter_op)
+        while docs_slice := list(islice(cursor, CHUNK_SIZE)):
+            yield map_nested_in_place(convert_mongo_objs, docs_slice)
+
+
+class CollectionLoaderParallell(CollectionLoader):
+    @property
+    def _sort_op(self) -> List[Optional[Tuple[str, int]]]:
+        if not self.incremental or not self.last_value:
+            return []
+        if self.incremental.last_value_func is max:
+            return [(self.cursor_field, ASCENDING)]
+        elif self.incremental.last_value_func is min:
+            return [(self.cursor_field, DESCENDING)]
+        return []
+
+    def _get_document_count(self) -> int:
+        return self.collection.count_documents(filter=self._filter_op)
+
+    def _create_batches(self) -> List[Dict[str, int]]:
+        doc_count = self._get_document_count()
+        return [
+            dict(skip=sk, limit=CHUNK_SIZE) for sk in range(0, doc_count, CHUNK_SIZE)
+        ]
+
+    def _get_cursor(self) -> TCursor:
+        cursor = self.collection.find(filter=self._filter_op)
+        if self._sort_op:
+            cursor = cursor.sort(self._sort_op)  # type: ignore
+        return cursor
+
+    @dlt.defer
+    def _run_batch(self, cursor: TCursor, batch: Dict[str, int]) -> TDataItem:
+        cursor = cursor.clone()
+
+        data = []
+        for document in cursor.skip(batch["skip"]).limit(batch["limit"]):
+            data.append(map_nested_in_place(convert_mongo_objs, document))
+        return data
+
+    def _get_all_batches(self) -> Iterator[TDataItem]:
+        batches = self._create_batches()
+        cursor = self._get_cursor()
+
+        for batch in batches:
+            yield self._run_batch(cursor=cursor, batch=batch)
+
+    def load_documents(self) -> Iterator[TDataItem]:
+        for document in self._get_all_batches():
+            yield document
+
+
+def collection_documents(
+    client: TMongoClient,
+    collection: TCollection,
+    incremental: Optional[dlt.sources.incremental[Any]] = None,
+    parallel: bool = False,
+) -> Iterator[TDataItem]:
+    """
+    A DLT source which loads data from a Mongo database using PyMongo.
+    Resources are automatically created for the collection.
+
+    Args:
+        client (MongoClient): The PyMongo client `pymongo.MongoClient` instance.
+        collection (Collection): The collection `pymongo.collection.Collection` to load.
+        incremental (Optional[dlt.sources.incremental[Any]]): The incremental configuration.
+        parallel (bool): Option to enable parallel loading for the collection. Default is False.
+
+    Returns:
+        Iterable[DltResource]: A list of DLT resources for each collection to be loaded.
+    """
+    LoaderClass = CollectionLoaderParallell if parallel else CollectionLoader
+
+    loader = LoaderClass(client, collection, incremental=incremental)
+    for data in loader.load_documents():
+        yield data
+
+
+def convert_mongo_objs(value: Any) -> Any:
+    if isinstance(value, (ObjectId, Decimal128)):
+        return str(value)
+    if isinstance(value, _datetime.datetime):
+        return ensure_pendulum_datetime(value)
+    return value
+
+
+def client_from_credentials(connection_url: str) -> TMongoClient:
+    client: TMongoClient = MongoClient(
+        connection_url, uuidRepresentation="standard", tz_aware=True
+    )
+    return client
+
+
+@configspec
+class MongoDbCollectionConfiguration(BaseConfiguration):
+    incremental: Optional[dlt.sources.incremental] = None  # type: ignore[type-arg]
+
+
+@configspec
+class MongoDbCollectionResourceConfiguration(BaseConfiguration):
+    connection_url: str
+    database: Optional[str]
+    collection: str
+    incremental: Optional[dlt.sources.incremental] = None  # type: ignore[type-arg]
+    write_disposition: Optional[str] = None
+    parallel: Optional[bool] = False
+
+
+__source_name__ = "mongodb"
```
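The non-parallel loader above drains the cursor in fixed-size slices of `CHUNK_SIZE` documents rather than materialising the whole collection at once. A minimal sketch of that pattern, with a plain iterator standing in for a pymongo cursor and a small chunk size purely for illustration:

```python
# Sketch only: the islice-based chunking used by CollectionLoader.load_documents.
from itertools import islice

CHUNK_SIZE = 3  # the source uses 10000; 3 keeps the example readable


def chunked(cursor):
    # keep pulling fixed-size slices until the cursor is exhausted
    while docs_slice := list(islice(cursor, CHUNK_SIZE)):
        yield docs_slice


fake_cursor = iter({"_id": i} for i in range(7))  # stand-in for collection.find(...)
print([len(batch) for batch in chunked(fake_cursor)])  # [3, 3, 1]
```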
```diff
--- ingestr-0.1.4/ingestr/src/sources.py
+++ ingestr-0.2.1/ingestr/src/sources.py
@@ -3,6 +3,7 @@ from typing import Callable
 
 import dlt
 
+from ingestr.src.mongodb import mongodb_collection
 from ingestr.src.sql_database import sql_table
 
 
@@ -15,7 +16,7 @@ class SqlSource:
     def dlt_source(self, uri: str, table: str, **kwargs):
         table_fields = table.split(".")
         if len(table_fields) != 2:
-            raise ValueError("Table name must be in the format schema
+            raise ValueError("Table name must be in the format schema.table")
 
         incremental = None
         if kwargs.get("incremental_key"):
@@ -29,6 +30,9 @@ class SqlSource:
                 end_value=end_value,
             )
 
+        if uri.startswith("mysql://"):
+            uri = uri.replace("mysql://", "mysql+pymysql://")
+
         table_instance = self.table_builder(
             credentials=uri,
             schema=table_fields[-2],
@@ -40,6 +44,39 @@ class SqlSource:
         return table_instance
 
 
+class MongoDbSource:
+    table_builder: Callable
+
+    def __init__(self, table_builder=mongodb_collection) -> None:
+        self.table_builder = table_builder
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        table_fields = table.split(".")
+        if len(table_fields) != 2:
+            raise ValueError("Table name must be in the format schema.table")
+
+        incremental = None
+        if kwargs.get("incremental_key"):
+            start_value = kwargs.get("interval_start")
+            end_value = kwargs.get("interval_end")
+
+            incremental = dlt.sources.incremental(
+                kwargs.get("incremental_key", ""),
+                initial_value=start_value,
+                end_value=end_value,
+            )
+
+        table_instance = self.table_builder(
+            connection_url=uri,
+            database=table_fields[-2],
+            collection=table_fields[-1],
+            parallel=True,
+            incremental=incremental,
+        )
+
+        return table_instance
+
+
 class LocalCsvSource:
     def dlt_source(self, uri: str, table: str, **kwargs):
         def csv_file():
```
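`MongoDbSource` reuses the two-part `--source-table` convention: the part before the dot is treated as the Mongo database and the part after it as the collection, and anything else is rejected. A minimal sketch of that split, with placeholder values:

```python
# Sketch only: the "database.collection" split MongoDbSource.dlt_source performs.
def split_source_table(table: str) -> tuple:
    fields = table.split(".")
    if len(fields) != 2:
        raise ValueError("Table name must be in the format schema.table")
    return fields[-2], fields[-1]


print(split_source_table("mydb.users"))  # ('mydb', 'users') -> database, collection
```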
```diff
--- /dev/null
+++ ingestr-0.2.1/ingestr/src/sources_test.py
@@ -0,0 +1,102 @@
+import unittest
+
+import dlt
+import pytest
+
+from ingestr.src.sources import MongoDbSource, SqlSource
+
+
+class SqlSourceTest(unittest.TestCase):
+    def test_sql_source_requires_two_fields_in_table(self):
+        source = SqlSource()
+        with pytest.raises(ValueError):
+            uri = "bigquery://my-project"
+            source.dlt_source(uri, "onetable")
+
+        with pytest.raises(ValueError):
+            uri = "bigquery://my-project"
+            source.dlt_source(uri, "onetable.with.too.many.fields")
+
+    def test_table_instance_is_created(self):
+        uri = "bigquery://my-project"
+        table = "schema.table"
+
+        # monkey patch the sql_table function
+        def sql_table(credentials, schema, table, incremental, merge_key):
+            self.assertEqual(credentials, uri)
+            self.assertEqual(schema, "schema")
+            self.assertEqual(table, "table")
+            self.assertIsNone(incremental)
+            self.assertIsNone(merge_key)
+            return dlt.resource()
+
+        source = SqlSource(table_builder=sql_table)
+        res = source.dlt_source(uri, table)
+        self.assertIsNotNone(res)
+
+    def test_table_instance_is_created_with_incremental(self):
+        uri = "bigquery://my-project"
+        table = "schema.table"
+        incremental_key = "id"
+
+        # monkey patch the sql_table function
+        def sql_table(credentials, schema, table, incremental, merge_key):
+            self.assertEqual(credentials, uri)
+            self.assertEqual(schema, "schema")
+            self.assertEqual(table, "table")
+            self.assertIsInstance(incremental, dlt.sources.incremental)
+            self.assertEqual(incremental.cursor_path, incremental_key)
+            self.assertIsNone(merge_key)
+            return dlt.resource()
+
+        source = SqlSource(table_builder=sql_table)
+        res = source.dlt_source(uri, table, incremental_key=incremental_key)
+        self.assertIsNotNone(res)
+
+
+class MongoDbSourceTest(unittest.TestCase):
+    def test_sql_source_requires_two_fields_in_table(self):
+        source = MongoDbSource()
+        with pytest.raises(ValueError):
+            uri = "mongodb://my-project"
+            source.dlt_source(uri, "onetable")
+
+        with pytest.raises(ValueError):
+            uri = "mongodb://my-project"
+            source.dlt_source(uri, "onetable.with.too.many.fields")
+
+    def test_table_instance_is_created(self):
+        uri = "mongodb://my-project"
+        table = "schema.table"
+
+        # monkey patch the mongo function
+        def mongo(connection_url, database, collection, incremental, parallel):
+            self.assertEqual(connection_url, uri)
+            self.assertEqual(database, "schema")
+            self.assertEqual(collection, "table")
+            self.assertIsNone(incremental)
+            self.assertTrue(parallel)
+            return dlt.resource()
+
+        source = MongoDbSource(table_builder=mongo)
+        res = source.dlt_source(uri, table)
+        self.assertIsNotNone(res)
+
+    def test_table_instance_is_created_with_incremental(self):
+        uri = "mongodb://my-project"
+        table = "schema.table"
+        incremental_key = "id"
+
+        # monkey patch the mongo function
+        def mongo(connection_url, database, collection, incremental, parallel):
+            self.assertEqual(connection_url, uri)
+            self.assertEqual(database, "schema")
+            self.assertEqual(collection, "table")
+            self.assertIsInstance(incremental, dlt.sources.incremental)
+            self.assertEqual(incremental.cursor_path, incremental_key)
+            self.assertTrue(parallel)
+            return dlt.resource()
+
+        source = MongoDbSource(table_builder=mongo)
+        res = source.dlt_source(uri, table, incremental_key=incremental_key)
+        self.assertIsNotNone(res)
```
```diff
--- /dev/null
+++ ingestr-0.2.1/ingestr/src/version.py
@@ -0,0 +1 @@
+__version__ = "0.2.1"
```
Binary files (the four `ingestr/testdata/*.db` fixtures): binary content, no textual diff shown.
```diff
--- ingestr-0.1.4/ingestr/src/sources_test.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import unittest
-
-import dlt
-import pytest
-
-from ingestr.src.sources import SqlSource
-
-
-class SqlSourceTest(unittest.TestCase):
-    def test_sql_source_requires_two_fields_in_table(self):
-        source = SqlSource()
-        with pytest.raises(ValueError):
-            uri = "bigquery://my-project"
-            source.dlt_source(uri, "onetable")
-
-        with pytest.raises(ValueError):
-            uri = "bigquery://my-project"
-            source.dlt_source(uri, "onetable.with.too.many.fields")
-
-    def test_table_instance_is_created(self):
-        uri = "bigquery://my-project"
-        table = "schema.table"
-
-        # monkey patch the sql_table function
-        def sql_table(credentials, schema, table, incremental, merge_key):
-            self.assertEqual(credentials, uri)
-            self.assertEqual(schema, "schema")
-            self.assertEqual(table, "table")
-            self.assertIsNone(incremental)
-            self.assertIsNone(merge_key)
-            return dlt.resource()
-
-        source = SqlSource(table_builder=sql_table)
-        res = source.dlt_source(uri, table)
-        self.assertIsNotNone(res)
-
-    def test_table_instance_is_created_with_incremental(self):
-        uri = "bigquery://my-project"
-        table = "schema.table"
-        incremental_key = "id"
-
-        # monkey patch the sql_table function
-        def sql_table(credentials, schema, table, incremental, merge_key):
-            self.assertEqual(credentials, uri)
-            self.assertEqual(schema, "schema")
-            self.assertEqual(table, "table")
-            self.assertIsInstance(incremental, dlt.sources.incremental)
-            self.assertEqual(incremental.cursor_path, incremental_key)
-            self.assertIsNone(merge_key)
-            return dlt.resource()
-
-        source = SqlSource(table_builder=sql_table)
-        res = source.dlt_source(uri, table, incremental_key=incremental_key)
-        self.assertIsNotNone(res)
```
```diff
--- ingestr-0.1.4/ingestr/src/version.py
+++ /dev/null
@@ -1 +0,0 @@
-__version__ = "0.1.4"
```
All remaining files listed above are unchanged between 0.1.4 and 0.2.1.