ingestr 0.1.4__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of ingestr has been flagged as potentially problematic.

Files changed (64)
  1. {ingestr-0.1.4 → ingestr-0.2.1}/PKG-INFO +3 -1
  2. {ingestr-0.1.4 → ingestr-0.2.1}/README.md +1 -0
  3. {ingestr-0.1.4 → ingestr-0.2.1}/docs/.vitepress/config.mjs +2 -1
  4. ingestr-0.2.1/docs/supported-sources/mongodb.md +24 -0
  5. {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/overview.md +1 -0
  6. {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/src/factory.py +3 -1
  7. ingestr-0.2.1/ingestr/src/factory_test.py +13 -0
  8. ingestr-0.2.1/ingestr/src/mongodb/__init__.py +103 -0
  9. ingestr-0.2.1/ingestr/src/mongodb/helpers.py +166 -0
  10. {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/src/sources.py +38 -1
  11. ingestr-0.2.1/ingestr/src/sources_test.py +102 -0
  12. ingestr-0.2.1/ingestr/src/version.py +1 -0
  13. {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/testdata/test_create_replace.db +0 -0
  14. {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/testdata/test_delete_insert_with_timerange.db +0 -0
  15. {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/testdata/test_delete_insert_without_primary_key.db +0 -0
  16. {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/testdata/test_merge_with_primary_key.db +0 -0
  17. {ingestr-0.1.4 → ingestr-0.2.1}/requirements.txt +1 -0
  18. ingestr-0.1.4/ingestr/src/sources_test.py +0 -54
  19. ingestr-0.1.4/ingestr/src/version.py +0 -1
  20. {ingestr-0.1.4 → ingestr-0.2.1}/.dockerignore +0 -0
  21. {ingestr-0.1.4 → ingestr-0.2.1}/.github/workflows/deploy-docs.yml +0 -0
  22. {ingestr-0.1.4 → ingestr-0.2.1}/.github/workflows/docker.yml +0 -0
  23. {ingestr-0.1.4 → ingestr-0.2.1}/.gitignore +0 -0
  24. {ingestr-0.1.4 → ingestr-0.2.1}/Dockerfile +0 -0
  25. {ingestr-0.1.4 → ingestr-0.2.1}/LICENSE.md +0 -0
  26. {ingestr-0.1.4 → ingestr-0.2.1}/Makefile +0 -0
  27. {ingestr-0.1.4 → ingestr-0.2.1}/docs/.vitepress/theme/custom.css +0 -0
  28. {ingestr-0.1.4 → ingestr-0.2.1}/docs/.vitepress/theme/index.js +0 -0
  29. {ingestr-0.1.4 → ingestr-0.2.1}/docs/commands/example-uris.md +0 -0
  30. {ingestr-0.1.4 → ingestr-0.2.1}/docs/commands/ingest.md +0 -0
  31. {ingestr-0.1.4 → ingestr-0.2.1}/docs/getting-started/core-concepts.md +0 -0
  32. {ingestr-0.1.4 → ingestr-0.2.1}/docs/getting-started/incremental-loading.md +0 -0
  33. {ingestr-0.1.4 → ingestr-0.2.1}/docs/getting-started/quickstart.md +0 -0
  34. {ingestr-0.1.4 → ingestr-0.2.1}/docs/getting-started/telemetry.md +0 -0
  35. {ingestr-0.1.4 → ingestr-0.2.1}/docs/index.md +0 -0
  36. {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/bigquery.md +0 -0
  37. {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/csv.md +0 -0
  38. {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/databricks.md +0 -0
  39. {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/duckdb.md +0 -0
  40. {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/mssql.md +0 -0
  41. {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/mysql.md +0 -0
  42. {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/oracle.md +0 -0
  43. {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/postgres.md +0 -0
  44. {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/redshift.md +0 -0
  45. {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/snowflake.md +0 -0
  46. {ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/sqlite.md +0 -0
  47. {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/main.py +0 -0
  48. {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/main_test.py +0 -0
  49. {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/src/destinations.py +0 -0
  50. {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/src/destinations_test.py +0 -0
  51. {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/src/sql_database/__init__.py +0 -0
  52. {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/src/sql_database/helpers.py +0 -0
  53. {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/src/sql_database/schema_types.py +0 -0
  54. {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/src/sql_database/settings.py +0 -0
  55. {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/src/telemetry/event.py +0 -0
  56. {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/src/testdata/fakebqcredentials.json +0 -0
  57. {ingestr-0.1.4 → ingestr-0.2.1}/ingestr/testdata/.gitignore +0 -0
  58. {ingestr-0.1.4 → ingestr-0.2.1}/package-lock.json +0 -0
  59. {ingestr-0.1.4 → ingestr-0.2.1}/package.json +0 -0
  60. {ingestr-0.1.4 → ingestr-0.2.1}/pyproject.toml +0 -0
  61. {ingestr-0.1.4 → ingestr-0.2.1}/requirements-dev.txt +0 -0
  62. {ingestr-0.1.4 → ingestr-0.2.1}/resources/demo.gif +0 -0
  63. {ingestr-0.1.4 → ingestr-0.2.1}/resources/demo.tape +0 -0
  64. {ingestr-0.1.4 → ingestr-0.2.1}/resources/ingestr.svg +0 -0
{ingestr-0.1.4 → ingestr-0.2.1}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ingestr
- Version: 0.1.4
+ Version: 0.2.1
  Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
  Project-URL: Homepage, https://github.com/bruin-data/ingestr
  Project-URL: Issues, https://github.com/bruin-data/ingestr/issues

@@ -23,6 +23,7 @@ Requires-Dist: google-cloud-bigquery-storage==2.24.0
  Requires-Dist: pendulum==3.0.0
  Requires-Dist: psycopg2-binary==2.9.9
  Requires-Dist: py-machineid==0.5.1
+ Requires-Dist: pymongo==4.6.2
  Requires-Dist: pymysql==1.1.0
  Requires-Dist: pyodbc==5.1.0
  Requires-Dist: redshift-connector==2.1.0

@@ -100,6 +101,7 @@ Join our Slack community [here](https://join.slack.com/t/bruindatacommunity/shar
  | DuckDB | ✅ | ✅ |
  | Microsoft SQL Server | ✅ | ✅ |
  | Local CSV file | ✅ | ✅ |
+ | MongoDB | ✅ | ❌ |
  | Oracle | ✅ | ❌ |
  | SQLite | ✅ | ❌ |
  | MySQL | ✅ | ❌ |
{ingestr-0.1.4 → ingestr-0.2.1}/README.md

@@ -61,6 +61,7 @@ Join our Slack community [here](https://join.slack.com/t/bruindatacommunity/shar
  | DuckDB | ✅ | ✅ |
  | Microsoft SQL Server | ✅ | ✅ |
  | Local CSV file | ✅ | ✅ |
+ | MongoDB | ✅ | ❌ |
  | Oracle | ✅ | ❌ |
  | SQLite | ✅ | ❌ |
  | MySQL | ✅ | ❌ |
{ingestr-0.1.4 → ingestr-0.2.1}/docs/.vitepress/config.mjs

@@ -46,15 +46,16 @@ export default defineConfig({
      {
        text: "Sources & Destinations",
        items: [
-         { text: "Overview", link: "/supported-sources/overview.md" },
          { text: "AWS Redshift", link: "/supported-sources/redshift.md" },
          { text: "Databricks", link: "/supported-sources/databricks.md" },
          { text: "DuckDB", link: "/supported-sources/duckdb.md" },
          { text: "Google BigQuery", link: "/supported-sources/bigquery.md" },
          { text: "Local CSV Files", link: "/supported-sources/csv.md" },
          { text: "Microsoft SQL Server", link: "/supported-sources/mssql.md" },
+         { text: "MongoDB", link: "/supported-sources/mongodb.md" },
          { text: "MySQL", link: "/supported-sources/mysql.md" },
          { text: "Oracle", link: "/supported-sources/oracle.md" },
+         { text: "Overview", link: "/supported-sources/overview.md" },
          { text: "Postgres", link: "/supported-sources/postgres.md" },
          { text: "Snowflake", link: "/supported-sources/snowflake.md" },
          { text: "SQLite", link: "/supported-sources/sqlite.md" },
ingestr-0.2.1/docs/supported-sources/mongodb.md (new)

@@ -0,0 +1,24 @@
+ # MongoDB
+ MongoDB is a popular, open-source NoSQL database known for its flexibility, scalability, and wide adoption in a variety of applications.
+
+ ingestr supports MongoDB as a source.
+
+ ## URI Format
+ The URI format for MongoDB is as follows:
+
+ ```plaintext
+ mongodb://user:password@host:port
+ ```
+
+ URI parameters:
+ - `user`: the user name to connect to the database
+ - `password`: the password for the user
+ - `host`: the host address of the database server
+ - `port`: the port number the database server is listening on (default is 27017 for MongoDB)
+
+
+ > [!CAUTION]
+ > Do not append the database name to the MongoDB URI; instead, pass it as part of the `--source-table` option in the `database.collection` format.
+
+
+ You can read more about MongoDB's connection string format [here](https://docs.mongodb.com/manual/reference/connection-string/).
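
For readers new to PyMongo, a minimal sketch of what this URI convention means in practice: the database is selected on the client rather than embedded in the URI, which is why ingestr takes it from `--source-table`. Host, credentials, and names below are placeholders:

```python
from pymongo import MongoClient

# The URI carries only credentials and host; no trailing database name.
client = MongoClient("mongodb://user:password@localhost:27017")

# The database and collection come from the `--source-table` value:
# "mydb.users" means database "mydb", collection "users".
collection = client["mydb"]["users"]
print(collection.estimated_document_count())
```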
{ingestr-0.1.4 → ingestr-0.2.1}/docs/supported-sources/overview.md

@@ -11,6 +11,7 @@ ingestr supports the following sources and destinations:
  | DuckDB | ✅ | ✅ |
  | Microsoft SQL Server | ✅ | ✅ |
  | Local CSV file | ✅ | ✅ |
+ | MongoDB | ✅ | ❌ |
  | Oracle | ✅ | ❌ |
  | SQLite | ✅ | ❌ |
  | MySQL | ✅ | ❌ |
{ingestr-0.1.4 → ingestr-0.2.1}/ingestr/src/factory.py

@@ -13,7 +13,7 @@ from ingestr.src.destinations import (
      RedshiftDestination,
      SnowflakeDestination,
  )
- from ingestr.src.sources import LocalCsvSource, SqlSource
+ from ingestr.src.sources import LocalCsvSource, MongoDbSource, SqlSource

  SQL_SOURCE_SCHEMES = [
      "bigquery",

@@ -77,6 +77,8 @@ class SourceDestinationFactory:
              return SqlSource()
          elif self.source_scheme == "csv":
              return LocalCsvSource()
+         elif self.source_scheme == "mongodb":
+             return MongoDbSource()
          else:
              raise ValueError(f"Unsupported source scheme: {self.source_scheme}")
ingestr-0.2.1/ingestr/src/factory_test.py (new)

@@ -0,0 +1,13 @@
+ from ingestr.src.factory import parse_scheme_from_uri
+
+
+ def test_scheme_is_parsed_from_uri_correctly():
+     assert parse_scheme_from_uri("bigquery://my-project") == "bigquery"
+     assert parse_scheme_from_uri("http://localhost:8080") == "http"
+     assert parse_scheme_from_uri("file:///tmp/myfile") == "file"
+     assert parse_scheme_from_uri("https://example.com?query=123") == "https"
+     assert parse_scheme_from_uri("ftp://ftp.example.com/downloads/file.zip") == "ftp"
+     assert (
+         parse_scheme_from_uri("redshift+psycopg2://user:pw@host") == "redshift+psycopg2"
+     )
+     assert parse_scheme_from_uri("mysql+pymysql://user:pw@host") == "mysql+pymysql"
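
The diff does not include `parse_scheme_from_uri` itself, so its implementation is not shown here; a minimal sketch consistent with these tests (an assumption, not necessarily the package's actual code) could rely on the standard library:

```python
from urllib.parse import urlparse


def parse_scheme_from_uri(uri: str) -> str:
    # urlparse keeps compound schemes such as "redshift+psycopg2" intact,
    # which is exactly what the assertions above require.
    return urlparse(uri).scheme
```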
ingestr-0.2.1/ingestr/src/mongodb/__init__.py (new)

@@ -0,0 +1,103 @@
+ """Source that loads collections from a mongo database, supports incremental loads."""
+
+ from typing import Any, Iterable, List, Optional
+
+ import dlt
+ from dlt.sources import DltResource
+
+ from .helpers import (
+     MongoDbCollectionConfiguration,
+     MongoDbCollectionResourceConfiguration,
+     client_from_credentials,
+     collection_documents,
+ )
+
+
+ @dlt.source
+ def mongodb(
+     connection_url: str = dlt.secrets.value,
+     database: Optional[str] = dlt.config.value,
+     collection_names: Optional[List[str]] = dlt.config.value,
+     incremental: Optional[dlt.sources.incremental] = None,  # type: ignore[type-arg]
+     write_disposition: Optional[str] = dlt.config.value,
+     parallel: Optional[bool] = dlt.config.value,
+ ) -> Iterable[DltResource]:
+     """
+     A DLT source which loads data from a mongo database using PyMongo.
+     Resources are automatically created for each collection in the database or for the given list of collections.
+
+     Args:
+         connection_url (str): Database connection_url.
+         database (Optional[str]): Selected database name; the default database is used if not passed.
+         collection_names (Optional[List[str]]): The list of collections `pymongo.collection.Collection` to load.
+         incremental (Optional[dlt.sources.incremental]): Option to enable incremental loading for the collection.
+             E.g., `incremental=dlt.sources.incremental('updated_at', pendulum.parse('2022-01-01T00:00:00Z'))`
+         write_disposition (str): Write disposition of the resource.
+         parallel (Optional[bool]): Option to enable parallel loading for the collection. Default is False.
+     Returns:
+         Iterable[DltResource]: A list of DLT resources for each collection to be loaded.
+     """
+
+     # set up mongo client
+     client = client_from_credentials(connection_url)
+     if not database:
+         mongo_database = client.get_default_database()
+     else:
+         mongo_database = client[database]
+
+     # use the provided collections, or all collections
+     if not collection_names:
+         collection_names = mongo_database.list_collection_names()
+
+     collection_list = [mongo_database[collection] for collection in collection_names]
+
+     for collection in collection_list:
+         yield dlt.resource(  # type: ignore
+             collection_documents,
+             name=collection.name,
+             primary_key="_id",
+             write_disposition=write_disposition,
+             spec=MongoDbCollectionConfiguration,
+         )(client, collection, incremental=incremental, parallel=parallel)
+
+
+ @dlt.common.configuration.with_config(
+     sections=("sources", "mongodb"), spec=MongoDbCollectionResourceConfiguration
+ )
+ def mongodb_collection(
+     connection_url: str = dlt.secrets.value,
+     database: Optional[str] = dlt.config.value,
+     collection: str = dlt.config.value,
+     incremental: Optional[dlt.sources.incremental] = None,  # type: ignore[type-arg]
+     write_disposition: Optional[str] = dlt.config.value,
+     parallel: Optional[bool] = dlt.config.value,
+ ) -> Any:
+     """
+     A DLT source which loads a collection from a mongo database using PyMongo.
+
+     Args:
+         connection_url (str): Database connection_url.
+         database (Optional[str]): Selected database name; the default database is used if not passed.
+         collection (str): The collection name to load.
+         incremental (Optional[dlt.sources.incremental]): Option to enable incremental loading for the collection.
+             E.g., `incremental=dlt.sources.incremental('updated_at', pendulum.parse('2022-01-01T00:00:00Z'))`
+         write_disposition (str): Write disposition of the resource.
+         parallel (Optional[bool]): Option to enable parallel loading for the collection. Default is False.
+     Returns:
+         Any: A single DLT resource for the collection to be loaded.
+     """
+     # set up mongo client
+     client = client_from_credentials(connection_url)
+     if not database:
+         mongo_database = client.get_default_database()
+     else:
+         mongo_database = client[database]
+
+     collection_obj = mongo_database[collection]
+
+     return dlt.resource(  # type: ignore
+         collection_documents,
+         name=collection_obj.name,
+         primary_key="_id",
+         write_disposition=write_disposition,
+     )(client, collection_obj, incremental=incremental, parallel=parallel)
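
To orient readers unfamiliar with dlt, here is a hedged usage sketch for the standalone resource above, loading one collection into DuckDB; the connection string, database, collection, and pipeline names are illustrative:

```python
import dlt

from ingestr.src.mongodb import mongodb_collection

# Build a resource for a single collection; parallel=True opts into the
# skip/limit batching implemented in helpers.py.
users = mongodb_collection(
    connection_url="mongodb://user:password@localhost:27017",
    database="mydb",
    collection="users",
    parallel=True,
)

# Run it through a dlt pipeline into a local DuckDB database.
pipeline = dlt.pipeline(
    pipeline_name="mongo_example", destination="duckdb", dataset_name="raw"
)
print(pipeline.run(users))
```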
ingestr-0.2.1/ingestr/src/mongodb/helpers.py (new)

@@ -0,0 +1,166 @@
+ """Mongo database source helpers"""
+
+ from itertools import islice
+ from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Tuple
+
+ import dlt
+ from bson.decimal128 import Decimal128
+ from bson.objectid import ObjectId
+ from dlt.common.configuration.specs import BaseConfiguration, configspec
+ from dlt.common.time import ensure_pendulum_datetime
+ from dlt.common.typing import TDataItem
+ from dlt.common.utils import map_nested_in_place
+ from pendulum import _datetime
+ from pymongo import ASCENDING, DESCENDING, MongoClient
+ from pymongo.collection import Collection
+ from pymongo.cursor import Cursor
+
+ if TYPE_CHECKING:
+     TMongoClient = MongoClient[Any]
+     TCollection = Collection[Any]  # type: ignore
+     TCursor = Cursor[Any]
+ else:
+     TMongoClient = Any
+     TCollection = Any
+     TCursor = Any
+
+ CHUNK_SIZE = 10000
+
+
+ class CollectionLoader:
+     def __init__(
+         self,
+         client: TMongoClient,
+         collection: TCollection,
+         incremental: Optional[dlt.sources.incremental[Any]] = None,
+     ) -> None:
+         self.client = client
+         self.collection = collection
+         self.incremental = incremental
+         if incremental:
+             self.cursor_field = incremental.cursor_path
+             self.last_value = incremental.last_value
+         else:
+             self.cursor_field = None
+             self.last_value = None
+
+     @property
+     def _filter_op(self) -> Dict[str, Any]:
+         if not self.incremental or not self.last_value:
+             return {}
+         if self.incremental.last_value_func is max:
+             return {self.cursor_field: {"$gte": self.last_value}}
+         elif self.incremental.last_value_func is min:
+             return {self.cursor_field: {"$lt": self.last_value}}
+         return {}
+
+     def load_documents(self) -> Iterator[TDataItem]:
+         cursor = self.collection.find(self._filter_op)
+         while docs_slice := list(islice(cursor, CHUNK_SIZE)):
+             yield map_nested_in_place(convert_mongo_objs, docs_slice)
+
+
+ class CollectionLoaderParallell(CollectionLoader):
+     @property
+     def _sort_op(self) -> List[Optional[Tuple[str, int]]]:
+         if not self.incremental or not self.last_value:
+             return []
+         if self.incremental.last_value_func is max:
+             return [(self.cursor_field, ASCENDING)]
+         elif self.incremental.last_value_func is min:
+             return [(self.cursor_field, DESCENDING)]
+         return []
+
+     def _get_document_count(self) -> int:
+         return self.collection.count_documents(filter=self._filter_op)
+
+     def _create_batches(self) -> List[Dict[str, int]]:
+         doc_count = self._get_document_count()
+         return [
+             dict(skip=sk, limit=CHUNK_SIZE) for sk in range(0, doc_count, CHUNK_SIZE)
+         ]
+
+     def _get_cursor(self) -> TCursor:
+         cursor = self.collection.find(filter=self._filter_op)
+         if self._sort_op:
+             cursor = cursor.sort(self._sort_op)  # type: ignore
+         return cursor
+
+     @dlt.defer
+     def _run_batch(self, cursor: TCursor, batch: Dict[str, int]) -> TDataItem:
+         cursor = cursor.clone()
+
+         data = []
+         for document in cursor.skip(batch["skip"]).limit(batch["limit"]):
+             data.append(map_nested_in_place(convert_mongo_objs, document))
+         return data
+
+     def _get_all_batches(self) -> Iterator[TDataItem]:
+         batches = self._create_batches()
+         cursor = self._get_cursor()
+
+         for batch in batches:
+             yield self._run_batch(cursor=cursor, batch=batch)
+
+     def load_documents(self) -> Iterator[TDataItem]:
+         for document in self._get_all_batches():
+             yield document
+
+
+ def collection_documents(
+     client: TMongoClient,
+     collection: TCollection,
+     incremental: Optional[dlt.sources.incremental[Any]] = None,
+     parallel: bool = False,
+ ) -> Iterator[TDataItem]:
+     """
+     A DLT source which loads data from a Mongo database using PyMongo.
+     Resources are automatically created for the collection.
+
+     Args:
+         client (MongoClient): The PyMongo client `pymongo.MongoClient` instance.
+         collection (Collection): The collection `pymongo.collection.Collection` to load.
+         incremental (Optional[dlt.sources.incremental[Any]]): The incremental configuration.
+         parallel (bool): Option to enable parallel loading for the collection. Default is False.
+
+     Returns:
+         Iterator[TDataItem]: The documents of the collection, loaded in chunks.
+     """
+     LoaderClass = CollectionLoaderParallell if parallel else CollectionLoader
+
+     loader = LoaderClass(client, collection, incremental=incremental)
+     for data in loader.load_documents():
+         yield data
+
+
+ def convert_mongo_objs(value: Any) -> Any:
+     if isinstance(value, (ObjectId, Decimal128)):
+         return str(value)
+     if isinstance(value, _datetime.datetime):
+         return ensure_pendulum_datetime(value)
+     return value
+
+
+ def client_from_credentials(connection_url: str) -> TMongoClient:
+     client: TMongoClient = MongoClient(
+         connection_url, uuidRepresentation="standard", tz_aware=True
+     )
+     return client
+
+
+ @configspec
+ class MongoDbCollectionConfiguration(BaseConfiguration):
+     incremental: Optional[dlt.sources.incremental] = None  # type: ignore[type-arg]
+
+
+ @configspec
+ class MongoDbCollectionResourceConfiguration(BaseConfiguration):
+     connection_url: str
+     database: Optional[str]
+     collection: str
+     incremental: Optional[dlt.sources.incremental] = None  # type: ignore[type-arg]
+     write_disposition: Optional[str] = None
+     parallel: Optional[bool] = False
+
+
+ __source_name__ = "mongodb"
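
To make the batching arithmetic in `_create_batches` concrete: with `CHUNK_SIZE = 10000`, a collection with 25,000 matching documents yields three skip/limit windows. A standalone restatement of the same computation:

```python
CHUNK_SIZE = 10000


def create_batches(doc_count: int) -> list:
    # One skip/limit window per CHUNK_SIZE documents, same as
    # CollectionLoaderParallell._create_batches above.
    return [dict(skip=sk, limit=CHUNK_SIZE) for sk in range(0, doc_count, CHUNK_SIZE)]


print(create_batches(25000))
# [{'skip': 0, 'limit': 10000}, {'skip': 10000, 'limit': 10000}, {'skip': 20000, 'limit': 10000}]
```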
{ingestr-0.1.4 → ingestr-0.2.1}/ingestr/src/sources.py

@@ -3,6 +3,7 @@ from typing import Callable

  import dlt

+ from ingestr.src.mongodb import mongodb_collection
  from ingestr.src.sql_database import sql_table


@@ -15,7 +16,7 @@ class SqlSource:
      def dlt_source(self, uri: str, table: str, **kwargs):
          table_fields = table.split(".")
          if len(table_fields) != 2:
-             raise ValueError("Table name must be in the format schema.<table>")
+             raise ValueError("Table name must be in the format schema.table")

          incremental = None
          if kwargs.get("incremental_key"):

@@ -29,6 +30,9 @@ class SqlSource:
              end_value=end_value,
          )

+         if uri.startswith("mysql://"):
+             uri = uri.replace("mysql://", "mysql+pymysql://")
+
          table_instance = self.table_builder(
              credentials=uri,
              schema=table_fields[-2],

@@ -40,6 +44,39 @@ class SqlSource:
          return table_instance


+ class MongoDbSource:
+     table_builder: Callable
+
+     def __init__(self, table_builder=mongodb_collection) -> None:
+         self.table_builder = table_builder
+
+     def dlt_source(self, uri: str, table: str, **kwargs):
+         table_fields = table.split(".")
+         if len(table_fields) != 2:
+             raise ValueError("Table name must be in the format schema.table")
+
+         incremental = None
+         if kwargs.get("incremental_key"):
+             start_value = kwargs.get("interval_start")
+             end_value = kwargs.get("interval_end")
+
+             incremental = dlt.sources.incremental(
+                 kwargs.get("incremental_key", ""),
+                 initial_value=start_value,
+                 end_value=end_value,
+             )
+
+         table_instance = self.table_builder(
+             connection_url=uri,
+             database=table_fields[-2],
+             collection=table_fields[-1],
+             parallel=True,
+             incremental=incremental,
+         )
+
+         return table_instance
+
+
  class LocalCsvSource:
      def dlt_source(self, uri: str, table: str, **kwargs):
          def csv_file():
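
Putting the new source together: a sketch of how `MongoDbSource.dlt_source` consumes a URI and a `database.collection` table argument (values are illustrative; by default `table_builder` is the `mongodb_collection` resource added above):

```python
from ingestr.src.sources import MongoDbSource

source = MongoDbSource()

# "--source-table mydb.users" arrives as the `table` argument and is split
# into database "mydb" and collection "users"; parallel loading is enabled.
# Passing incremental_key="updated_at" would also build a dlt.sources.incremental.
resource = source.dlt_source("mongodb://user:password@localhost:27017", "mydb.users")
```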
ingestr-0.2.1/ingestr/src/sources_test.py (new)

@@ -0,0 +1,102 @@
+ import unittest
+
+ import dlt
+ import pytest
+
+ from ingestr.src.sources import MongoDbSource, SqlSource
+
+
+ class SqlSourceTest(unittest.TestCase):
+     def test_sql_source_requires_two_fields_in_table(self):
+         source = SqlSource()
+         with pytest.raises(ValueError):
+             uri = "bigquery://my-project"
+             source.dlt_source(uri, "onetable")
+
+         with pytest.raises(ValueError):
+             uri = "bigquery://my-project"
+             source.dlt_source(uri, "onetable.with.too.many.fields")
+
+     def test_table_instance_is_created(self):
+         uri = "bigquery://my-project"
+         table = "schema.table"
+
+         # monkey patch the sql_table function
+         def sql_table(credentials, schema, table, incremental, merge_key):
+             self.assertEqual(credentials, uri)
+             self.assertEqual(schema, "schema")
+             self.assertEqual(table, "table")
+             self.assertIsNone(incremental)
+             self.assertIsNone(merge_key)
+             return dlt.resource()
+
+         source = SqlSource(table_builder=sql_table)
+         res = source.dlt_source(uri, table)
+         self.assertIsNotNone(res)
+
+     def test_table_instance_is_created_with_incremental(self):
+         uri = "bigquery://my-project"
+         table = "schema.table"
+         incremental_key = "id"
+
+         # monkey patch the sql_table function
+         def sql_table(credentials, schema, table, incremental, merge_key):
+             self.assertEqual(credentials, uri)
+             self.assertEqual(schema, "schema")
+             self.assertEqual(table, "table")
+             self.assertIsInstance(incremental, dlt.sources.incremental)
+             self.assertEqual(incremental.cursor_path, incremental_key)
+             self.assertIsNone(merge_key)
+             return dlt.resource()
+
+         source = SqlSource(table_builder=sql_table)
+         res = source.dlt_source(uri, table, incremental_key=incremental_key)
+         self.assertIsNotNone(res)
+
+
+ class MongoDbSourceTest(unittest.TestCase):
+     def test_sql_source_requires_two_fields_in_table(self):
+         source = MongoDbSource()
+         with pytest.raises(ValueError):
+             uri = "mongodb://my-project"
+             source.dlt_source(uri, "onetable")
+
+         with pytest.raises(ValueError):
+             uri = "mongodb://my-project"
+             source.dlt_source(uri, "onetable.with.too.many.fields")
+
+     def test_table_instance_is_created(self):
+         uri = "mongodb://my-project"
+         table = "schema.table"
+
+         # monkey patch the mongo function
+         def mongo(connection_url, database, collection, incremental, parallel):
+             self.assertEqual(connection_url, uri)
+             self.assertEqual(database, "schema")
+             self.assertEqual(collection, "table")
+             self.assertIsNone(incremental)
+             self.assertTrue(parallel)
+             return dlt.resource()
+
+         source = MongoDbSource(table_builder=mongo)
+         res = source.dlt_source(uri, table)
+         self.assertIsNotNone(res)
+
+     def test_table_instance_is_created_with_incremental(self):
+         uri = "mongodb://my-project"
+         table = "schema.table"
+         incremental_key = "id"
+
+         # monkey patch the mongo function
+         def mongo(connection_url, database, collection, incremental, parallel):
+             self.assertEqual(connection_url, uri)
+             self.assertEqual(database, "schema")
+             self.assertEqual(collection, "table")
+             self.assertIsInstance(incremental, dlt.sources.incremental)
+             self.assertEqual(incremental.cursor_path, incremental_key)
+             self.assertTrue(parallel)
+             return dlt.resource()
+
+         source = MongoDbSource(table_builder=mongo)
+         res = source.dlt_source(uri, table, incremental_key=incremental_key)
+         self.assertIsNotNone(res)
ingestr-0.2.1/ingestr/src/version.py (new)

@@ -0,0 +1 @@
+ __version__ = "0.2.1"
{ingestr-0.1.4 → ingestr-0.2.1}/requirements.txt

@@ -7,6 +7,7 @@ google-cloud-bigquery-storage==2.24.0
  pendulum==3.0.0
  psycopg2-binary==2.9.9
  py-machineid==0.5.1
+ pymongo==4.6.2
  pymysql==1.1.0
  pyodbc==5.1.0
  redshift-connector==2.1.0
ingestr-0.1.4/ingestr/src/sources_test.py (removed)

@@ -1,54 +0,0 @@
- import unittest
-
- import dlt
- import pytest
-
- from ingestr.src.sources import SqlSource
-
-
- class SqlSourceTest(unittest.TestCase):
-     def test_sql_source_requires_two_fields_in_table(self):
-         source = SqlSource()
-         with pytest.raises(ValueError):
-             uri = "bigquery://my-project"
-             source.dlt_source(uri, "onetable")
-
-         with pytest.raises(ValueError):
-             uri = "bigquery://my-project"
-             source.dlt_source(uri, "onetable.with.too.many.fields")
-
-     def test_table_instance_is_created(self):
-         uri = "bigquery://my-project"
-         table = "schema.table"
-
-         # monkey patch the sql_table function
-         def sql_table(credentials, schema, table, incremental, merge_key):
-             self.assertEqual(credentials, uri)
-             self.assertEqual(schema, "schema")
-             self.assertEqual(table, "table")
-             self.assertIsNone(incremental)
-             self.assertIsNone(merge_key)
-             return dlt.resource()
-
-         source = SqlSource(table_builder=sql_table)
-         res = source.dlt_source(uri, table)
-         self.assertIsNotNone(res)
-
-     def test_table_instance_is_created_with_incremental(self):
-         uri = "bigquery://my-project"
-         table = "schema.table"
-         incremental_key = "id"
-
-         # monkey patch the sql_table function
-         def sql_table(credentials, schema, table, incremental, merge_key):
-             self.assertEqual(credentials, uri)
-             self.assertEqual(schema, "schema")
-             self.assertEqual(table, "table")
-             self.assertIsInstance(incremental, dlt.sources.incremental)
-             self.assertEqual(incremental.cursor_path, incremental_key)
-             self.assertIsNone(merge_key)
-             return dlt.resource()
-
-         source = SqlSource(table_builder=sql_table)
-         res = source.dlt_source(uri, table, incremental_key=incremental_key)
-         self.assertIsNotNone(res)
ingestr-0.1.4/ingestr/src/version.py (removed)

@@ -1 +0,0 @@
- __version__ = "0.1.4"
All remaining files are unchanged.