airbyte-internal-ops 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. {airbyte_internal_ops-0.4.1.dist-info → airbyte_internal_ops-0.5.0.dist-info}/METADATA +1 -1
  2. {airbyte_internal_ops-0.4.1.dist-info → airbyte_internal_ops-0.5.0.dist-info}/RECORD +13 -52
  3. airbyte_ops_mcp/cli/cloud.py +42 -3
  4. airbyte_ops_mcp/cloud_admin/api_client.py +473 -0
  5. airbyte_ops_mcp/cloud_admin/models.py +56 -0
  6. airbyte_ops_mcp/mcp/cloud_connector_versions.py +460 -0
  7. airbyte_ops_mcp/mcp/prerelease.py +6 -46
  8. airbyte_ops_mcp/regression_tests/ci_output.py +151 -71
  9. airbyte_ops_mcp/regression_tests/http_metrics.py +21 -2
  10. airbyte_ops_mcp/regression_tests/models.py +6 -0
  11. airbyte_ops_mcp/telemetry.py +162 -0
  12. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/.gitignore +0 -1
  13. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/README.md +0 -420
  14. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/__init__.py +0 -2
  15. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/commons/__init__.py +0 -1
  16. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/commons/backends/__init__.py +0 -8
  17. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/commons/backends/base_backend.py +0 -16
  18. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/commons/backends/duckdb_backend.py +0 -87
  19. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/commons/backends/file_backend.py +0 -165
  20. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/commons/connection_objects_retrieval.py +0 -377
  21. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/commons/connector_runner.py +0 -247
  22. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/commons/errors.py +0 -7
  23. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/commons/evaluation_modes.py +0 -25
  24. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/commons/hacks.py +0 -23
  25. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/commons/json_schema_helper.py +0 -384
  26. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/commons/mitm_addons.py +0 -37
  27. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/commons/models.py +0 -595
  28. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/commons/proxy.py +0 -207
  29. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/commons/secret_access.py +0 -47
  30. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/commons/segment_tracking.py +0 -45
  31. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/commons/utils.py +0 -214
  32. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/conftest.py.disabled +0 -751
  33. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/consts.py +0 -4
  34. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/poetry.lock +0 -4480
  35. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/pytest.ini +0 -9
  36. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/regression_tests/__init__.py +0 -1
  37. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/regression_tests/test_check.py +0 -61
  38. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/regression_tests/test_discover.py +0 -117
  39. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/regression_tests/test_read.py +0 -627
  40. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/regression_tests/test_spec.py +0 -43
  41. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/report.py +0 -542
  42. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/stash_keys.py +0 -38
  43. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/templates/__init__.py +0 -0
  44. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/templates/private_details.html.j2 +0 -305
  45. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/templates/report.html.j2 +0 -515
  46. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/utils.py +0 -187
  47. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/validation_tests/__init__.py +0 -0
  48. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/validation_tests/test_check.py +0 -61
  49. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/validation_tests/test_discover.py +0 -217
  50. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/validation_tests/test_read.py +0 -177
  51. airbyte_ops_mcp/_legacy/airbyte_ci/connector_live_tests/validation_tests/test_spec.py +0 -631
  52. {airbyte_internal_ops-0.4.1.dist-info → airbyte_internal_ops-0.5.0.dist-info}/WHEEL +0 -0
  53. {airbyte_internal_ops-0.4.1.dist-info → airbyte_internal_ops-0.5.0.dist-info}/entry_points.txt +0 -0
@@ -1,420 +0,0 @@
1
- # Connector Live Testing
2
-
3
- This project contains utilities for running connector tests against live data.
4
-
5
- ## Requirements
6
-
7
- - `docker`
8
- - `Python ^3.11`
9
- - `pipx`
10
- - `poetry`
11
-
12
- ## Install
13
-
14
- ```bash
15
- # From airbyte-ci/connectors/live-tests
16
- poetry install
17
- ```
18
-
19
- Note that `poetry lock` + `poetry install` didn't seem to have any impact on the version of `connection_retriever`. To update this dependency to the latest version, I had to run `poetry add git+https://github.com/airbytehq/airbyte-platform-internal.git@master#subdirectory=tools/connection-retriever`.
20
-
21
- ## Regression tests
22
-
23
- We created a regression test suite to run tests to compare the outputs of connector commands on different versions of the same connector.
24
-
25
- ## Validation tests
26
-
27
- The validation test suite makes assertions about the output of airbyte commands for the target version of the connector only.
28
-
29
- ## Tutorial(s)
30
-
31
- - [Loom Walkthrough (Airbyte Only)](https://www.loom.com/share/97c49d7818664b119cff6911a8a211a2?sid=4570a5b6-9c81-4db3-ba33-c74dc5845c3c)
32
- - [Internal Docs (Airbyte Only)](https://docs.google.com/document/d/1pzTxJTsooc9iQDlALjvOWtnq6yRTvzVtbkJxY4R36_I/edit)
33
-
34
- ### How to Use
35
-
36
- > ⚠️ **Note:** While you can use this tool without building a dev image, to achieve your goals you will likely need to have installed [airbyte-ci](https://github.com/airbytehq/airbyte/blob/master/airbyte-ci/connectors/pipelines/README.md) and know how to build a dev image.
37
-
38
- You can run the existing test suites with the following command:
39
-
40
- #### With local connection objects (`config.json`, `catalog.json`, `state.json`)
41
-
42
- ```bash
43
- poetry run pytest src/live_tests \
44
- --connector-image=airbyte/source-faker \
45
- --config-path=<path-to-config-path> \
46
- --catalog-path=<path-to-catalog-path> \
47
- --target-version=dev \
48
- --pr-url=<PR-URL> # The URL of the PR you are testing
49
- ```
50
-
51
- #### Using a live connection
52
-
53
- The live connection objects will be fetched.
54
-
55
- ```bash
56
- poetry run pytest src/live_tests \
57
- --connector-image=airbyte/source-faker \
58
- --target-version=dev \
59
- --pr-url=<PR-URL> # The URL of the PR you are testing
60
- ```
61
-
62
- You can also pass paths to local connection objects with `--config-path`, `--state-path` or `--catalog-path` to override the live connection objects.
63
-
64
- #### Test artifacts
65
-
66
- The test suite run will produce test artifacts in the `/tmp/regression_tests_artifacts/` folder.
67
- **They will get cleared after each test run on prompt exit. Please do not copy them elsewhere in your filesystem as they contain sensitive data that are not meant to be stored outside of your debugging session!**
68
-
69
- ##### Artifacts types
70
-
71
- - `report.html`: A report of the test run.
72
- - `stdout.log`: The collected standard output following the command execution
73
- - `stderr.log`: The collected standard error following the command execution
74
- - `http_dump.mitm`: An `mitmproxy` http stream log. Can be consumed with `mitmweb` (version `>=10`) for debugging.
75
- - `http_dump.har`: An `mitmproxy` http stream log in HAR format (a JSON encoded version of the mitm dump).
76
- - `airbyte_messages`: A directory containing `.jsonl` files for each message type (logs, records, traces, controls, states etc.) produced by the connector.
77
- - `duck.db`: A DuckDB database containing the messages produced by the connector.
78
- - `dagger.log`: The log of the Dagger session, useful for debugging errors unrelated to the tests.
79
-
80
- **Tests can also write specific artifacts like diffs under a directory named after the test function.**
81
-
82
- ```
83
- /tmp/regression_tests_artifacts
84
- └── session_1710754231
85
- ├── duck.db
86
- ├── report.html
87
- ├── command_execution_artifacts
88
- │   └── source-orb
89
- │   ├── check
90
- │   │   ├── dev
91
- │   │   │   ├── airbyte_messages
92
- │   │   │   │   ├── connection_status.jsonl
93
- │   │   │   │   └── logs.jsonl
94
- │   │   │   ├── http_dump.har
95
- │   │   │   ├── http_dump.mitm
96
- │   │   │   ├── stderr.log
97
- │   │   │   └── stdout.log
98
- │   │   └── latest
99
- │   │   ├── airbyte_messages
100
- │   │   │   ├── connection_status.jsonl
101
- │   │   │   └── logs.jsonl
102
- │   │   ├── http_dump.har
103
- │   │   ├── http_dump.mitm
104
- │   │   ├── stderr.log
105
- │   │   └── stdout.log
106
- │   ├── discover
107
- │   │   ├── dev
108
- │   │   │   ├── airbyte_messages
109
- │   │   │   │   └── catalog.jsonl
110
- │   │   │   ├── http_dump.har
111
- │   │   │   ├── http_dump.mitm
112
- │   │   │   ├── stderr.log
113
- │   │   │   └── stdout.log
114
- │   │   └── latest
115
- │   │   ├── airbyte_messages
116
- │   │   │   └── catalog.jsonl
117
- │   │   ├── http_dump.har
118
- │   │   ├── http_dump.mitm
119
- │   │   ├── stderr.log
120
- │   │   └── stdout.log
121
- │   ├── read-with-state
122
- │   │   ├── dev
123
- │   │   │   ├── airbyte_messages
124
- │   │   │   │   ├── logs.jsonl
125
- │   │   │   │   ├── records.jsonl
126
- │   │   │   │   ├── states.jsonl
127
- │   │   │   │   └── traces.jsonl
128
- │   │   │   ├── http_dump.har
129
- │   │   │   ├── http_dump.mitm
130
- │   │   │   ├── stderr.log
131
- │   │   │   └── stdout.log
132
- │   │   └── latest
133
- │   │   ├── airbyte_messages
134
- │   │   │   ├── logs.jsonl
135
- │   │   │   ├── records.jsonl
136
- │   │   │   ├── states.jsonl
137
- │   │   │   └── traces.jsonl
138
- │   │   ├── http_dump.har
139
- │   │   ├── http_dump.mitm
140
- │   │   ├── stderr.log
141
- │   │   └── stdout.log
142
- │   └── spec
143
- │   ├── dev
144
- │   │   ├── airbyte_messages
145
- │   │   │   └── spec.jsonl
146
- │   │   ├── stderr.log
147
- │   │   └── stdout.log
148
- │   └── latest
149
- │   ├── airbyte_messages
150
- │   │   └── spec.jsonl
151
- │   ├── stderr.log
152
- │   └── stdout.log
153
- └── dagger.log
154
- ```
155
-
156
- #### HTTP Proxy and caching
157
-
158
- We use a containerized `mitmproxy` to capture the HTTP traffic between the connector and the source. Connector command runs produce `http_dump.mitm` (can be consumed with `mitmproxy` (version `>=10`) for debugging) and `http_dump.har` (a JSON encoded version of the mitm dump) artifacts.
159
- The traffic recorded on the control connector is passed to the target connector proxy to cache the responses for requests with the same URL. This is useful to avoid hitting the source API multiple times when running the same command on different versions of the connector.
160
-
161
- ### Custom CLI Arguments
162
-
163
- | Argument | Description | Required/Optional |
164
- |----------------------------|----------------------------------------------------------------------------------------------------------------------------------------------| ----------------- |
165
- | `--connector-image` | Docker image name of the connector to debug (e.g., `airbyte/source-faker`). | Required |
166
- | `--control-version` | Version of the control connector for regression testing. Must be an unambiguous connector version (e.g. 1.2.3 rather than `latest`) | Required |
167
- | `--target-version` | Version of the connector being tested. (Defaults to dev) | Optional |
168
- | `--pr-url` | URL of the pull request being tested. | Required |
169
- | `--connection-id` | ID of the connection for live testing. If not provided, a prompt will appear to choose. | Optional |
170
- | `--config-path` | Path to the custom source configuration file. | Optional |
171
- | `--catalog-path` | Path to the custom configured catalog file. | Optional |
172
- | `--state-path` | Path to the custom state file. | Optional |
173
- | `--http-cache` | Use the HTTP cache for the connector. | Optional |
174
- | `--run-id` | Unique identifier for the test run. If not provided, a timestamp will be used. | Optional |
175
- | `--auto-select-connection` | Automatically select a connection for testing. | Optional |
176
- | `--stream` | Name of the stream to test. Can be specified multiple times to test multiple streams. | Optional |
177
- | `--should-read-with-state` | Specify whether to read with state. If not provided, a prompt will appear to choose. | Optional |
178
- | `--disable-proxy` | Specify whether to disable proxy. If not provided, a proxy will be enabled. | Optional |
179
- | `--test-evaluation-mode` | Whether to run tests in "diagnostic" mode or "strict" mode. In diagnostic mode, eligible tests will always pass unless there's an exception. | Optional |
180
- | `--connection-subset` | The subset of connections to select from. Possible values are "sandboxes" or "all" (defaults to sandboxes). | Optional |
181
-
182
- ## Changelog
183
-
184
-
185
- ### 0.21.4
186
- Update connection id to use first 8 chars in the report
187
-
188
- ### 0.21.3
189
- Update dependencies to avoid genson issue
190
-
191
- ### 0.21.2
192
- Fix selected streams filter in regression tests
193
-
194
- ### 0.21.1
195
- Update Python version requirement from 3.10 to 3.11.
196
-
197
- ### 0.21.0
198
- Add `disable_proxy` flag
199
-
200
-
201
- ### 0.20.0
202
- Support multiple connection objects in the regression tests suite.
203
-
204
-
205
- ### 0.19.10
206
- Pin the connection retriever until we make required changes to support the new version.
207
-
208
-
209
- ### 0.19.8
210
-
211
- Give ownership of copied connection object files to the image user to make sure it has permission to write them (config migration).
212
-
213
- ### 0.19.7
214
-
215
- Mount connection objects to readable paths in the container for rootless images.
216
-
217
- ### 0.19.6
218
-
219
- Write connector output to a different in container path to avoid permission issues now that connector images are rootless.
220
-
221
- ### 0.19.5
222
-
223
- Fix `ZeroDivisionError` in Regression test tool
224
-
225
- ### 0.19.4
226
-
227
- Update `connection_retriever` to 0.7.4
228
-
229
- ### 0.19.3
230
-
231
- Update `get_container_from_id` with the correct new Dagger API.
232
-
233
- ### 0.19.2
234
-
235
- Update Dagger to 0.13.3
236
-
237
- ### 0.19.1
238
-
239
- Fixed the `UserDict` type annotation not found bug.
240
-
241
- ### 0.19.0
242
-
243
- Delete the `debug` command.
244
-
245
- ### 0.18.8
246
-
247
- Improve error message when failing to retrieve connection.
248
- Ask to double-check that a sync ran with the control version on the selected connection.
249
-
250
- ### 0.18.7
251
-
252
- Improve error message when failing to retrieve connection.
253
-
254
- ### 0.18.6
255
-
256
- Disable the `SortQueryParams` MITM proxy addon to avoid double URL encoding.
257
-
258
- ### 0.18.5
259
-
260
- Relax test_oneof_usage criteria for constant value definitions in connector SPEC output.
261
-
262
- ### 0.18.4
263
-
264
- Bugfix: Use connection-retriever 0.7.2
265
-
266
- ### 0.18.3
267
-
268
- Updated dependencies.
269
-
270
- ### 0.18.2
271
-
272
- Allow live tests with or without state in CI.
273
-
274
- ### 0.18.1
275
-
276
- Fix extra argument.
277
-
278
- ### 0.18.0
279
-
280
- Add support for selecting from a subset of connections.
281
-
282
- ### 0.17.8
283
-
284
- Fix the self-signed certificate path we bind to Python connectors.
285
-
286
- ### 0.17.7
287
-
288
- Explicitly pass the control version to the connection retriever. Defaults to the latest released version of the connector under test.
289
-
290
- ### 0.17.6
291
-
292
- Display diagnostic test with warning.
293
-
294
- ### 0.17.5
295
-
296
- Performance improvements using caching.
297
-
298
- ### 0.17.4
299
-
300
- Fix control image when running tests in CI.
301
-
302
- ### 0.17.3
303
-
304
- Pin requests dependency.
305
-
306
- ### 0.17.2
307
-
308
- Fix duckdb dependency.
309
-
310
- ### 0.17.1
311
-
312
- Bump the connection-retriever version to fix deprecated query.
313
-
314
- ### 0.17.0
315
-
316
- Enable running in GitHub actions.
317
-
318
- ### 0.16.0
319
-
320
- Enable running with airbyte-ci.
321
-
322
- ### 0.15.0
323
-
324
- Automatic retrieval of connection objects for regression tests. The connection id is not required anymore.
325
-
326
- ### 0.14.2
327
-
328
- Fix KeyError when target & control streams differ.
329
-
330
- ### 0.14.1
331
-
332
- Improve performance when reading records per stream.
333
-
334
- ### 0.14.0
335
-
336
- Track usage via Segment.
337
-
338
- ### 0.13.0
339
-
340
- Show test docstring in the test report.
341
-
342
- ### 0.12.0
343
-
344
- Implement a test to compare schema inferred on both control and target version.
345
-
346
- ### 0.11.0
347
-
348
- Create a global duckdb instance to store messages produced by the connector in target and control version.
349
-
350
- ### 0.10.0
351
-
352
- Show record count per stream in report and list untested streams.
353
-
354
- ### 0.9.0
355
-
356
- Make the regression tests suite better at handling large connector outputs.
357
-
358
- ### 0.8.1
359
-
360
- Improve diff output.
361
-
362
- ### 0.8.0
363
-
364
- Regression tests: add an HTML report.
365
-
366
- ### 0.7.0
367
-
368
- Improve the proxy workflow and caching logic + generate HAR files.
369
-
370
- ### 0.6.6
371
-
372
- Exit pytest if connection can't be retrieved.
373
-
374
- ### 0.6.6
375
-
376
- Clean up debug files when prompt is closed.
377
-
378
- ### 0.6.5
379
-
380
- Improve ConnectorRunner logging.
381
-
382
- ### 0.6.4
383
-
384
- Add more data integrity checks to the regression tests suite.
385
-
386
- ### 0.6.3
387
-
388
- Make catalog diffs more readable.
389
-
390
- ### 0.6.2
391
-
392
- Clean up regression test artifacts on any exception.
393
-
394
- ### 0.6.1
395
-
396
- Modify diff output for `discover` and `read` tests.
397
-
398
- ### 0.5.1
399
-
400
- Handle connector command execution errors.
401
-
402
- ### 0.5.0
403
-
404
- Add new tests and confirmation prompts.
405
-
406
- ### 0.4.0
407
-
408
- Introduce DuckDB to store the messages produced by the connector.
409
-
410
- ### 0.3.0
411
-
412
- Pass connection id to the regression tests suite.
413
-
414
- ### 0.2.0
415
-
416
- Declare the regression tests suite.
417
-
418
- ### 0.1.0
419
-
420
- Implement initial primitives and a `debug` command to run connector commands and persist the outputs to local storage.
@@ -1,2 +0,0 @@
1
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2
-
@@ -1 +0,0 @@
1
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
@@ -1,8 +0,0 @@
1
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2
- from __future__ import annotations
3
-
4
- from .base_backend import BaseBackend
5
- from .duckdb_backend import DuckDbBackend
6
- from .file_backend import FileBackend
7
-
8
- __all__ = ["BaseBackend", "DuckDbBackend", "FileBackend"]
@@ -1,16 +0,0 @@
1
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2
- from __future__ import annotations
3
-
4
- from abc import ABC, abstractmethod
5
- from collections.abc import Iterable
6
-
7
- from airbyte_protocol.models import AirbyteMessage # type: ignore
8
-
9
-
10
class BaseBackend(ABC):
    """Common contract shared by the file backend and the database backend(s)."""

    @abstractmethod
    def write(self, airbyte_messages: Iterable[AirbyteMessage]) -> None:
        """Write the given Airbyte messages; each concrete backend defines the destination."""
@@ -1,87 +0,0 @@
1
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2
- from __future__ import annotations
3
-
4
- import logging
5
- import re
6
- from collections.abc import Iterable
7
- from pathlib import Path
8
- from typing import Optional
9
-
10
- import duckdb
11
- from airbyte_protocol.models import AirbyteMessage # type: ignore
12
- from live_tests.commons.backends.file_backend import FileBackend
13
-
14
-
15
class DuckDbBackend(FileBackend):
    """File backend that also loads the JSONL files it wrote into a DuckDB database.

    ``write`` first delegates to ``FileBackend.write`` and then creates one
    DuckDB table for each JSONL file that exists on disk.
    """

    # Forwarded to read_json_auto as ``sample_size``.
    # NOTE(review): per the DuckDB docs -1 means "scan the whole file" for
    # schema inference -- confirm against the pinned DuckDB version.
    SAMPLE_SIZE = -1

    def __init__(
        self,
        output_directory: Path,
        duckdb_path: Path,
        schema: Optional[Iterable[str]] = None,
    ):
        """Store the DuckDB file location and the optional schema name parts.

        Args:
            output_directory: Directory handed to ``FileBackend.__init__``.
            duckdb_path: Path of the DuckDB database file to connect to.
            schema: Optional name parts; when truthy they are sanitized and
                joined with ``_`` to form the schema the tables are created in.
        """
        super().__init__(output_directory)
        self.duckdb_path = duckdb_path
        self.schema = schema

    @property
    def jsonl_files_to_insert(self) -> Iterable[Path]:
        """JSONL files (one per message type) that are loaded as tables."""
        return [
            self.jsonl_catalogs_path,
            self.jsonl_connection_status_path,
            self.jsonl_specs_path,
            self.jsonl_states_path,
            self.jsonl_traces_path,
            self.jsonl_logs_path,
            self.jsonl_controls_path,
            self.jsonl_records_path,
        ]

    @staticmethod
    def sanitize_table_name(table_name: str) -> str:
        """Turn an arbitrary name into something usable as an unquoted identifier.

        Every whitespace character becomes ``_`` (not only plain spaces, so
        tabs/newlines cannot leak into the SQL), all remaining non-word
        characters are dropped, and a leading digit is prefixed with ``_``.
        """
        sanitized = re.sub(r"\s", "_", str(table_name))
        sanitized = re.sub(r"\W", "", sanitized)
        if sanitized and sanitized[0].isdigit():
            sanitized = "_" + sanitized
        return sanitized

    def _create_table_from_jsonl(
        self,
        duck_db_conn,
        table_name: str,
        json_file: Path,
        schema_label: str,
        purpose: str = "",
    ) -> None:
        """Create ``table_name`` from a newline-delimited JSON file and log it."""
        logging.info(
            f"Creating table {table_name} from {json_file} in schema {schema_label}{purpose}"
        )
        # Double single quotes so a path containing "'" stays a valid SQL literal.
        file_literal = str(json_file).replace("'", "''")
        duck_db_conn.sql(
            f"CREATE TABLE {table_name} AS SELECT * FROM read_json_auto('{file_literal}', sample_size = {self.SAMPLE_SIZE}, format = 'newline_delimited')"
        )
        logging.info(f"Table {table_name} created in schema {schema_label}")

    def write(self, airbyte_messages: Iterable[AirbyteMessage]) -> None:
        """Write messages to disk, then load the resulting JSONL files into DuckDB.

        Args:
            airbyte_messages: Messages forwarded unchanged to ``FileBackend.write``.
        """
        # Use the FileBackend to write the messages to disk as jsonl files
        super().write(airbyte_messages)
        duck_db_conn = duckdb.connect(str(self.duckdb_path))
        try:
            # Label used in log lines only; previously the schema name was
            # unbound (UnboundLocalError) whenever no schema was configured.
            schema_label = "<default>"
            if self.schema:
                sanitized_schema_name = "_".join(
                    self.sanitize_table_name(s) for s in self.schema
                )
                duck_db_conn.sql(f"CREATE SCHEMA IF NOT EXISTS {sanitized_schema_name}")
                duck_db_conn.sql(f"USE {sanitized_schema_name}")
                logging.info(f"Using schema {sanitized_schema_name}")
                schema_label = sanitized_schema_name

            for json_file in self.jsonl_files_to_insert:
                if json_file.exists():
                    self._create_table_from_jsonl(
                        duck_db_conn,
                        self.sanitize_table_name(json_file.stem),
                        json_file,
                        schema_label,
                    )

            for json_file in self.record_per_stream_paths_data_only.values():
                if json_file.exists():
                    self._create_table_from_jsonl(
                        duck_db_conn,
                        self.sanitize_table_name(f"records_{json_file.stem}"),
                        json_file,
                        schema_label,
                        purpose=" to store stream records with the data field only",
                    )
        finally:
            # Always release the database handle, even if a statement fails.
            duck_db_conn.close()