unstructured-ingest 0.0.25__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +42 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +15 -0
- test/integration/connectors/databricks_tests/__init__.py +0 -0
- test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_postgres.py +178 -0
- test/integration/connectors/sql/test_sqlite.py +151 -0
- test/integration/connectors/test_s3.py +152 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker_compose.py +44 -0
- test/integration/connectors/utils/validation.py +203 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_bedrock.py +49 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +47 -0
- test/integration/embedders/test_octoai.py +41 -0
- test/integration/embedders/test_openai.py +41 -0
- test/integration/embedders/test_vertexai.py +41 -0
- test/integration/embedders/test_voyageai.py +41 -0
- test/integration/embedders/togetherai.py +43 -0
- test/integration/embedders/utils.py +44 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +41 -0
- test/unit/embed/test_octoai.py +20 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_chunking_utils.py +36 -0
- test/unit/test_error.py +27 -0
- test/unit/test_interfaces.py +280 -0
- test/unit/test_interfaces_v2.py +26 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +164 -0
- test/unit/test_utils_v2.py +82 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +2 -2
- unstructured_ingest/connector/notion/types/block.py +1 -0
- unstructured_ingest/connector/notion/types/database.py +1 -0
- unstructured_ingest/connector/notion/types/page.py +1 -0
- unstructured_ingest/embed/bedrock.py +0 -20
- unstructured_ingest/embed/huggingface.py +0 -21
- unstructured_ingest/embed/interfaces.py +29 -3
- unstructured_ingest/embed/mixedbreadai.py +0 -36
- unstructured_ingest/embed/octoai.py +2 -24
- unstructured_ingest/embed/openai.py +0 -20
- unstructured_ingest/embed/togetherai.py +40 -0
- unstructured_ingest/embed/vertexai.py +0 -20
- unstructured_ingest/embed/voyageai.py +1 -24
- unstructured_ingest/interfaces.py +1 -1
- unstructured_ingest/v2/cli/utils/click.py +21 -2
- unstructured_ingest/v2/interfaces/connector.py +22 -2
- unstructured_ingest/v2/interfaces/downloader.py +1 -0
- unstructured_ingest/v2/processes/chunker.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +5 -18
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +17 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +14 -6
- unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +177 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +310 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +172 -0
- unstructured_ingest/v2/processes/embedder.py +13 -0
- unstructured_ingest/v2/processes/partitioner.py +2 -1
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/METADATA +16 -14
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/RECORD +85 -31
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/top_level.txt +1 -0
- unstructured_ingest/v2/processes/connectors/sql.py +0 -275
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/entry_points.txt +0 -0
|
@@ -1,7 +1,51 @@
|
|
|
1
|
+
test/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
test/integration/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
test/integration/utils.py,sha256=CWqzEGw6TA_ZoP9hRUkW64TWYssooBbufcTRmbJvod8,401
|
|
4
|
+
test/integration/chunkers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
test/integration/chunkers/test_chunkers.py,sha256=pqn1Rqh36jZTJL4qpU0iuOMFAEQ-LrKAPOgWtQMAt_I,1482
|
|
6
|
+
test/integration/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
test/integration/connectors/conftest.py,sha256=Q8ScDzrzO2o-8D_kYFt8LL7QAhoFTRRtKJKMc2hLMcI,345
|
|
8
|
+
test/integration/connectors/test_s3.py,sha256=fK0soCTkNxp-4hm4O2LPrhlZXvYmaeTmeEgeNh1b0k8,5839
|
|
9
|
+
test/integration/connectors/databricks_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
+
test/integration/connectors/databricks_tests/test_volumes_native.py,sha256=kS45mnNu9_U4qV3cxByEFXCYLEBWRy-fxxhzR3r93cs,5685
|
|
11
|
+
test/integration/connectors/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
+
test/integration/connectors/sql/test_postgres.py,sha256=A9vWj5pBdoEyL2m6d3e2Ep8ZZcnLhdXkaHPPlkTStbg,6581
|
|
13
|
+
test/integration/connectors/sql/test_sqlite.py,sha256=F6Ljb6npmFZlq_5pvJj-0Hkk2mC3T-pMAGyhDm1UtM4,5702
|
|
14
|
+
test/integration/connectors/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
+
test/integration/connectors/utils/constants.py,sha256=0zSPnsZVqJuNhXduXvdXFQLZTRIQa5Fo_1qjBYVCfb8,209
|
|
16
|
+
test/integration/connectors/utils/docker_compose.py,sha256=6XeYOKQFZCBRLEmcgH2mmBAaVs6R6jCWAhJLjq6p-aM,1771
|
|
17
|
+
test/integration/connectors/utils/validation.py,sha256=Sf0ELATWG5K3E3d5S_ArtZeFFYdzoI5jN86U4DiqNyw,8422
|
|
18
|
+
test/integration/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
19
|
+
test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdRIlNDd5AGY,334
|
|
20
|
+
test/integration/embedders/test_bedrock.py,sha256=0oBRNS_DtFDGQ22Z1T3t6VOJ31PrItgvnJpqcLe9Fg4,1903
|
|
21
|
+
test/integration/embedders/test_huggingface.py,sha256=0mMTOO-Nh7KB70AGs_7LLQIxMYrnSPqyihriUeqACbM,1007
|
|
22
|
+
test/integration/embedders/test_mixedbread.py,sha256=RrLv8SByMNXsgrlh94RbaT-VyxZ4-DILO-OPpmOwvSI,1441
|
|
23
|
+
test/integration/embedders/test_octoai.py,sha256=LnR0BLttamW5PGid6jFxATDAi0x7hq5iWMXurbHP6TI,1328
|
|
24
|
+
test/integration/embedders/test_openai.py,sha256=0jlFqEeeCneIWX9tGyC3TXeUNqsMXR7u5n7uEIaAQKo,1328
|
|
25
|
+
test/integration/embedders/test_vertexai.py,sha256=OtoFzmrWWhGIO5Bbl5zt_4sp6qRHZxtaDQKpGcfzNLM,1345
|
|
26
|
+
test/integration/embedders/test_voyageai.py,sha256=Zqf7nn1AxfBDBr5A9Jr-5pxes4QNvfKiyeGexCCm4nY,1346
|
|
27
|
+
test/integration/embedders/togetherai.py,sha256=0W1ScD5yb1D9hPC2ewUsuCHLUOpCuM083YMBhqAI9fw,1395
|
|
28
|
+
test/integration/embedders/utils.py,sha256=3AMKMBpgBep_0jFqrqMHH8BJo6w60kpouSZ5JPJTwIA,1850
|
|
29
|
+
test/integration/partitioners/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
|
+
test/integration/partitioners/test_partitioner.py,sha256=KEpnhsz2YNAoQ2UZGOTsi1_uk1h4Vg-gGTsy5Fe9OCw,2846
|
|
31
|
+
test/unit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
32
|
+
test/unit/test_chunking_utils.py,sha256=0iPwfnMPpyTm-yOE0BXMnEQQP4iguS6NhOqgMQU5nhk,1390
|
|
33
|
+
test/unit/test_error.py,sha256=RflmngCdFNKOLXVfLnUdNfY3Mfg3k7DTEzfIl0B-syU,840
|
|
34
|
+
test/unit/test_interfaces.py,sha256=XNj8qasc1ltaeUv-2y31rv7R9xquo0rgRrMvBZoNZLw,9623
|
|
35
|
+
test/unit/test_interfaces_v2.py,sha256=nyxUsRX1M6Mfhux7SqEhal85PIaWO5xhm6ZTcqpPpHI,790
|
|
36
|
+
test/unit/test_logger.py,sha256=0SKndXE_VRd8XmUHkrj7zuBQHZscXx3ZQllMEOvtF9Y,2380
|
|
37
|
+
test/unit/test_utils.py,sha256=xJ9WGpHBihWpQWvIzd6z99UIdZJba8U7c31h3q6C9To,4800
|
|
38
|
+
test/unit/test_utils_v2.py,sha256=TWVAeE0OrcHgPyzGPtEnQakICsVrDeVhIKPMRQPX554,2638
|
|
39
|
+
test/unit/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
40
|
+
test/unit/embed/test_mixedbreadai.py,sha256=XFNJDP5pIgF3eQYwBiuEWmH3zZWx72Wpwyv-Q4m0DJg,1332
|
|
41
|
+
test/unit/embed/test_octoai.py,sha256=Ha9EgAW64Q45hFj51tToe8RyKXWXwqAkdDqSFDMu37Q,831
|
|
42
|
+
test/unit/embed/test_openai.py,sha256=0O1yshDcE0BMKv1yJqrNuiNLSdPhLpKqJ-D_wmnidsM,831
|
|
43
|
+
test/unit/embed/test_vertexai.py,sha256=Pl7COc9E3tf_yGidkTEmTizNGyZF1F5zuL2TgPTMnfI,1048
|
|
44
|
+
test/unit/embed/test_voyageai.py,sha256=DviCOJFhe5H4e26-kNyX3JNe8h3qB5Yl0KOe8rQEMrc,981
|
|
1
45
|
unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
|
|
2
|
-
unstructured_ingest/__version__.py,sha256=
|
|
46
|
+
unstructured_ingest/__version__.py,sha256=ch9Ch304-rlC6iFyomBT7OHb9bvtQNzaejmd5QwbzKE,42
|
|
3
47
|
unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
|
|
4
|
-
unstructured_ingest/interfaces.py,sha256=
|
|
48
|
+
unstructured_ingest/interfaces.py,sha256=m03BgenxSA34HbW157L7V9TGxK_dTG7N2AnAhF31W-U,31364
|
|
5
49
|
unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
|
|
6
50
|
unstructured_ingest/main.py,sha256=82G_7eG4PNhc_xIqj4Y_sFbDV9VI-nwSfsfJQMzovMk,169
|
|
7
51
|
unstructured_ingest/processor.py,sha256=XKKrvbxsb--5cDzz4hB3-GfWZYyIjJ2ah8FpzQKF_DM,2760
|
|
@@ -9,7 +53,7 @@ unstructured_ingest/cli/__init__.py,sha256=9kNcBOHuXON5lB1MJU9QewEhwPmId56vXqB29
|
|
|
9
53
|
unstructured_ingest/cli/cli.py,sha256=LutBTBYMqboKw8cputHVszpenyfnySzcUC15ifwuYyg,1049
|
|
10
54
|
unstructured_ingest/cli/cmd_factory.py,sha256=UdHm1KacTombpF6DxyTSwTCuApsKHUYw_kVu5Nhcy3Y,364
|
|
11
55
|
unstructured_ingest/cli/common.py,sha256=I0El08FHz5kxw7iz0VWOWPrvcJD1rBgXJSwVIpVmmwU,204
|
|
12
|
-
unstructured_ingest/cli/interfaces.py,sha256=
|
|
56
|
+
unstructured_ingest/cli/interfaces.py,sha256=lpaaOdAQ4NMsawVaHSk5lXCcZ0Mw85kRzfElu1ODCB0,24090
|
|
13
57
|
unstructured_ingest/cli/utils.py,sha256=KNhkFNKOeEihc8HlvMz_MTbYVQNFklrBKbC8xg9h1xE,7982
|
|
14
58
|
unstructured_ingest/cli/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
59
|
unstructured_ingest/cli/base/cmd.py,sha256=BbfjA2v203Jh-7DL6bzxQ7fOeNixd5BsBMuzXz6h5IQ,583
|
|
@@ -106,11 +150,11 @@ unstructured_ingest/connector/notion/connector.py,sha256=8A9d-Pej-uXzjEy85zUloxI
|
|
|
106
150
|
unstructured_ingest/connector/notion/helpers.py,sha256=-eEB8eSqdD5bWX_QEA2hZz1siucC0FNEUEqCEJptiVk,20702
|
|
107
151
|
unstructured_ingest/connector/notion/interfaces.py,sha256=SrTT-9c0nvk0fMqVgudYF647r04AdMKi6wkIkMy7Szw,563
|
|
108
152
|
unstructured_ingest/connector/notion/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
109
|
-
unstructured_ingest/connector/notion/types/block.py,sha256=
|
|
110
|
-
unstructured_ingest/connector/notion/types/database.py,sha256=
|
|
153
|
+
unstructured_ingest/connector/notion/types/block.py,sha256=w3j3F_z-50dpIpBt4Ib8_U4eINZRFMfGHdbE3hjkbu0,3028
|
|
154
|
+
unstructured_ingest/connector/notion/types/database.py,sha256=1SrP5sxWhif8dxCteXnJAFX2HwoXI2EJy9IRIzM_nGM,2570
|
|
111
155
|
unstructured_ingest/connector/notion/types/date.py,sha256=Ah0ekF18S_9xVDT2Ps1NGD1eOihtInGIYji_BDIalig,729
|
|
112
156
|
unstructured_ingest/connector/notion/types/file.py,sha256=xc5UQ46qWvVd3SkKJctRBqMVERCNc_UVVc21pu66IME,1291
|
|
113
|
-
unstructured_ingest/connector/notion/types/page.py,sha256=
|
|
157
|
+
unstructured_ingest/connector/notion/types/page.py,sha256=oZS5ausaiA68Ux-i6mOA0qYywP0X7YchXL8gWscMaxQ,1427
|
|
114
158
|
unstructured_ingest/connector/notion/types/parent.py,sha256=VTNyL5JNVLb5AqR5P-c658DC9bUgkRKPA9fI2CFZWoU,1695
|
|
115
159
|
unstructured_ingest/connector/notion/types/rich_text.py,sha256=V0fqXLAq7H5A6Av0IM8TqqhqW45VWD8K79sHdh1FyA8,5450
|
|
116
160
|
unstructured_ingest/connector/notion/types/user.py,sha256=VK-XYFt2WdtEqm_LGnmE22ms7xw84ia3pSBhpmy2IKg,1800
|
|
@@ -167,14 +211,15 @@ unstructured_ingest/connector/notion/types/database_properties/unique_id.py,sha2
|
|
|
167
211
|
unstructured_ingest/connector/notion/types/database_properties/url.py,sha256=iXQ2tVUm9UlKVtDA0NQiFIRJ5PHYW9wOaWt2vFfSVCg,862
|
|
168
212
|
unstructured_ingest/connector/notion/types/database_properties/verification.py,sha256=J_DLjY-v2T6xDGMQ7FkI0YMKMA6SG6Y3yYW7qUD1hKA,2334
|
|
169
213
|
unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
170
|
-
unstructured_ingest/embed/bedrock.py,sha256
|
|
171
|
-
unstructured_ingest/embed/huggingface.py,sha256=
|
|
172
|
-
unstructured_ingest/embed/interfaces.py,sha256=
|
|
173
|
-
unstructured_ingest/embed/mixedbreadai.py,sha256=
|
|
174
|
-
unstructured_ingest/embed/octoai.py,sha256=
|
|
175
|
-
unstructured_ingest/embed/openai.py,sha256=
|
|
176
|
-
unstructured_ingest/embed/
|
|
177
|
-
unstructured_ingest/embed/
|
|
214
|
+
unstructured_ingest/embed/bedrock.py,sha256=-PRdZsF44vwi6G4G75gdO31AJKfZWClOXkJQAk7rEO8,3096
|
|
215
|
+
unstructured_ingest/embed/huggingface.py,sha256=2cBiQhOhfWHX3hS-eKjocysOkUaRlyRfUj9Kxjrp6cE,1934
|
|
216
|
+
unstructured_ingest/embed/interfaces.py,sha256=au4Xp8ciDvo4bidlUbazFW2aC7NZW5-UDLKXBFVzAX4,2025
|
|
217
|
+
unstructured_ingest/embed/mixedbreadai.py,sha256=OwFWWukvkQaXhjgs6b6N6D4w7sYrtcHNhsHAj-Bocj4,4268
|
|
218
|
+
unstructured_ingest/embed/octoai.py,sha256=jHytDfQgup0v1PBcmlMv1nIh9Obg8WGO5qtLmN-Ot5g,1473
|
|
219
|
+
unstructured_ingest/embed/openai.py,sha256=JXo4boivNoo2lBzHuS4Z0FZ1zlgUGAPVt0X3HY540ZU,1282
|
|
220
|
+
unstructured_ingest/embed/togetherai.py,sha256=BL7NzExSE-laQqrp4ybUgoZ9JG_eop4hk-s2yCO_d5c,1451
|
|
221
|
+
unstructured_ingest/embed/vertexai.py,sha256=X5bGJdXyR5nAFH_ocAVgEowmd60nOBykyfclYo3VfBM,2808
|
|
222
|
+
unstructured_ingest/embed/voyageai.py,sha256=bjom9QqWmH1Mv08ewg8ZG7gO3rQPMVS0_ztm2KBAOjI,1821
|
|
178
223
|
unstructured_ingest/enhanced_dataclass/__init__.py,sha256=gDZOUsv5eo-8jm4Yu7DdDwi101aGbfG7JctTdOYnTOM,151
|
|
179
224
|
unstructured_ingest/enhanced_dataclass/core.py,sha256=d6aUkDynuKX87cHx9_N5UDUWrvISR4jYRFRTvd_avlI,3038
|
|
180
225
|
unstructured_ingest/enhanced_dataclass/dataclasses.py,sha256=aZMsoCzAGRb8Rmh3BTSBFtNr6FmFTY93KYGLk3gYJKQ,1949
|
|
@@ -276,11 +321,11 @@ unstructured_ingest/v2/cli/base/dest.py,sha256=zDjqek7anr0JQ2ptEl8KIAsUXuCuHRnBQ
|
|
|
276
321
|
unstructured_ingest/v2/cli/base/importer.py,sha256=nRt0QQ3qpi264-n_mR0l55C2ddM8nowTNzT1jsWaam8,1128
|
|
277
322
|
unstructured_ingest/v2/cli/base/src.py,sha256=cpQ43qQju4e5s_YSaPxUtA70BaisRkTBdjtlPhqn5Mg,2872
|
|
278
323
|
unstructured_ingest/v2/cli/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
279
|
-
unstructured_ingest/v2/cli/utils/click.py,sha256=
|
|
324
|
+
unstructured_ingest/v2/cli/utils/click.py,sha256=HCEcdHf8Lck0zcx3kidKjLbHDHXIBxPRL2MGgtKtDlg,6967
|
|
280
325
|
unstructured_ingest/v2/cli/utils/model_conversion.py,sha256=uJQKpbTC5ysOdVaRq2SWEjG8btBimVZYzX9NVL7xnzs,7500
|
|
281
326
|
unstructured_ingest/v2/interfaces/__init__.py,sha256=Rfa8crx6De7WNOK-EjsWWwFVpsUfCc6gY8B8tQ3ae9I,899
|
|
282
|
-
unstructured_ingest/v2/interfaces/connector.py,sha256=
|
|
283
|
-
unstructured_ingest/v2/interfaces/downloader.py,sha256=
|
|
327
|
+
unstructured_ingest/v2/interfaces/connector.py,sha256=qUFFJ3qgDMenTCZMtVRjq1DIwsVak6pxNjQOH2eVkMw,1623
|
|
328
|
+
unstructured_ingest/v2/interfaces/downloader.py,sha256=Lj3nTY1hPA71GfNeedFVCdHdZsHLle8qrx5RtXAy9GY,2940
|
|
284
329
|
unstructured_ingest/v2/interfaces/file_data.py,sha256=ieJK-hqHCEOmoYNGoFbCHziSaZyMtRS9VpSoYbwoKCE,1944
|
|
285
330
|
unstructured_ingest/v2/interfaces/indexer.py,sha256=Bd1S-gTLsxhJBLEh1lYm_gXqwQLaEZMoqPq9yGxtN_E,713
|
|
286
331
|
unstructured_ingest/v2/interfaces/process.py,sha256=BgglTu5K93FnDDopZKKr_rkK2LTZOguR6kcQjKHjF40,392
|
|
@@ -302,13 +347,13 @@ unstructured_ingest/v2/pipeline/steps/stage.py,sha256=cphKgHScLz2rNLZRI5Olsb6dAH
|
|
|
302
347
|
unstructured_ingest/v2/pipeline/steps/uncompress.py,sha256=CFSy4tGp6BAvF0oIwWFN8v4zFzh5pRDeESjEn5iP9hE,1756
|
|
303
348
|
unstructured_ingest/v2/pipeline/steps/upload.py,sha256=zlgXgwReX9TBOdfTpS9hETah4SeSmzPB2g8dAGfLIvM,1987
|
|
304
349
|
unstructured_ingest/v2/processes/__init__.py,sha256=FaHWSCGyc7GWVnAsNEUUj7L8hT8gCVY3_hUE2VzWtUg,462
|
|
305
|
-
unstructured_ingest/v2/processes/chunker.py,sha256=
|
|
350
|
+
unstructured_ingest/v2/processes/chunker.py,sha256=1bfJ2qgl6qu2HvClzHbC7-q5QtUp7mrlNxZxnPGYTm0,5479
|
|
306
351
|
unstructured_ingest/v2/processes/connector_registry.py,sha256=vkEe6jpgdYtZCxMj59s5atWGgmPuxAEXRUoTt-MJ7wc,2198
|
|
307
|
-
unstructured_ingest/v2/processes/embedder.py,sha256=
|
|
352
|
+
unstructured_ingest/v2/processes/embedder.py,sha256=PQn0IO8xbGRQHpcT2VVl-J8gTJ5HGGEP9gdEAwMVK3U,6498
|
|
308
353
|
unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
|
|
309
|
-
unstructured_ingest/v2/processes/partitioner.py,sha256=
|
|
354
|
+
unstructured_ingest/v2/processes/partitioner.py,sha256=2Lhztd730soVC2TOqrn_ba7CGZna8AHHpqJY2ZUYVxE,7776
|
|
310
355
|
unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
|
|
311
|
-
unstructured_ingest/v2/processes/connectors/__init__.py,sha256=
|
|
356
|
+
unstructured_ingest/v2/processes/connectors/__init__.py,sha256=glyowqb93_NNreQXoRLbF0PvzMc6Ptv0ARfl3xfSH4E,4967
|
|
312
357
|
unstructured_ingest/v2/processes/connectors/airtable.py,sha256=Yi7PEv_FejZ9_y3BPY3gu5YGVfeLh-9YX-qLyQHjJsY,8921
|
|
313
358
|
unstructured_ingest/v2/processes/connectors/astradb.py,sha256=ZctZRfXcOAMBGPkKgHvhTmV_-2F0YN5vqwfY9UCHIlU,5791
|
|
314
359
|
unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py,sha256=S55v7TXu30rEdgythMBB_2VcuomyMPmcPtLYykbhw_E,8466
|
|
@@ -317,10 +362,10 @@ unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=SONLywyEfoAlLc-H
|
|
|
317
362
|
unstructured_ingest/v2/processes/connectors/databricks_volumes.py,sha256=BQHHpCDwE51inD3pZF4tL4zLr7lv6iBcwnA1NazrHqY,9423
|
|
318
363
|
unstructured_ingest/v2/processes/connectors/elasticsearch.py,sha256=ojxMUHkLa6ZG50aTGn2YWhDHZ1n38uFRn5p8_ghAIvM,16762
|
|
319
364
|
unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=7xOQthcqBd9auJxB0nxZlhh1vdjXpMX_CtQZa6YfZz0,13088
|
|
320
|
-
unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=
|
|
365
|
+
unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=8bGHbZctJ_Tl1AUSMnI7CCZ7CgEtTRVcRuvlB1HPlqQ,5907
|
|
321
366
|
unstructured_ingest/v2/processes/connectors/local.py,sha256=a3stgnIkhBbXPIQD0O-RaRM-Eb-szHj9Yy4Fz881-9c,6723
|
|
322
367
|
unstructured_ingest/v2/processes/connectors/milvus.py,sha256=ZUlyAQyTt0U1JoapFYHQW3IIaGYY50b3URDSLEAFjtk,7687
|
|
323
|
-
unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=
|
|
368
|
+
unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=A0pt6JcNTD5bEu79jZ8KhnHcBQ2VUJ2AjtQAtdFr_Lo,13175
|
|
324
369
|
unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=ZiUo-dFo1LMOvFwphSLRZiR1PcrN8GWLTHhsh4TU6n0,9207
|
|
325
370
|
unstructured_ingest/v2/processes/connectors/opensearch.py,sha256=dfDSNrWIEk19wuHdlMJpp_SLMOteNPlkDBPlAwu1LVY,6767
|
|
326
371
|
unstructured_ingest/v2/processes/connectors/outlook.py,sha256=NK67Pd8Nk5oUIXTK-sK18K7rZ_Cl0UuCbeF2ExBEZho,9294
|
|
@@ -328,21 +373,30 @@ unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=k_GH55S_OQ6-wCLC6
|
|
|
328
373
|
unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
|
|
329
374
|
unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=hOaV5gBcHFc6N5Rbu3MgM-5Aol1ht-QkNIN4PqjvfxE,19665
|
|
330
375
|
unstructured_ingest/v2/processes/connectors/singlestore.py,sha256=4rVvWKK2iQr03Ff6cB5zjfE1MpN0JyIGpCxxFCDI6hc,5563
|
|
331
|
-
unstructured_ingest/v2/processes/connectors/sql.py,sha256=srj2ECKnkGR_iEFBdpa8sxw3ACCvJ5L0uoKCuHxKUe4,9204
|
|
332
376
|
unstructured_ingest/v2/processes/connectors/utils.py,sha256=8kd0g7lo9NqnpaIkjeO-Ut6erhwUNH_gS9koevpe3WE,878
|
|
333
377
|
unstructured_ingest/v2/processes/connectors/weaviate.py,sha256=Ss0YyD5T6k-00eJ6dr5lSo2H0LcOjVTMmozehyTvnAo,8866
|
|
378
|
+
unstructured_ingest/v2/processes/connectors/databricks/__init__.py,sha256=jO71UTC7bLA_N12CrLWJzh_yZML5gfT7VohxzCpUGWg,1848
|
|
379
|
+
unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=db4PxE1LiKWUq0b9THABFRChArAfHps89pZBglqEg3c,6521
|
|
380
|
+
unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=I1MJwe5LOxoPLjwo00H0XbXO6u_SJHWYgsj4s6ePoyI,2754
|
|
381
|
+
unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=P4rfcE3td7WyuuguRgUnGQytCMDpfeYrrpshBZuVynY,3539
|
|
382
|
+
unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=UUotY_-HpgSEJkvdQfZTlbxY7CRLZ4ctL8TlryeFvxk,2790
|
|
383
|
+
unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=Wk7s2_u5G0BOV5slvGc8IlUf7ivznY9PrgPqe6nlJKM,2897
|
|
334
384
|
unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Ypl_n6sl7I1JqX6bGSG0t_FqvCqE3Cy24og,1846
|
|
335
385
|
unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=Y01BuVRql0Kvzc_cdaZE9dDGYjJzrwJu-etfUrEGcUU,7061
|
|
336
386
|
unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=Cjk0LUxqOCDbme0GmnD_5_b1hfStjI23cKw6BquKNrg,5488
|
|
337
387
|
unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=NNAxIRdOQxUncfwhu7J7SnQRM6BSStNOyQZi-4E51iY,5816
|
|
338
|
-
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=
|
|
388
|
+
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=eFcrpSAB8wbLHuCiDb-2QpEUtgEEUA_iSqcT81H2-3Q,11472
|
|
339
389
|
unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=-_pYHbsBG9FyRyNIaf_xyFbPiiR7pnWEEg_8mp0rIZ8,7053
|
|
340
390
|
unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=je1BDqFWlyMfPa4oAMMNFQLLQtCY9quuqx3xjTwF8OQ,6251
|
|
341
391
|
unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=dwpyqDq0qceCBWX3zM1hiUlgXB4hzX6ObOr-sh-5CJs,6926
|
|
342
392
|
unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
|
|
343
|
-
unstructured_ingest
|
|
344
|
-
unstructured_ingest
|
|
345
|
-
unstructured_ingest
|
|
346
|
-
unstructured_ingest
|
|
347
|
-
unstructured_ingest-0.
|
|
348
|
-
unstructured_ingest-0.
|
|
393
|
+
unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=tr3SZH0tz04XSxqGRkUu__tL_0zn0bSms2jILE-3Rug,543
|
|
394
|
+
unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=oMwfYCycX-jTSKW-c6o6K09aU74Wn1B_G3Ib20oYi1A,6050
|
|
395
|
+
unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=MbSvYSjhgGj8HHI7P-gH5bQ0Lqxtf8BEFsKNmCUfzug,9807
|
|
396
|
+
unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=LxC2Q_rPHytbTDflmWzj4H5Jx-41phKnfp6FCpDe-UY,5701
|
|
397
|
+
unstructured_ingest-0.1.1.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
|
|
398
|
+
unstructured_ingest-0.1.1.dist-info/METADATA,sha256=LQ_M1kX7q7rGBvslwml9KbrJGJHAaA_SLWM64BBaZrg,7188
|
|
399
|
+
unstructured_ingest-0.1.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
|
400
|
+
unstructured_ingest-0.1.1.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
401
|
+
unstructured_ingest-0.1.1.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
|
|
402
|
+
unstructured_ingest-0.1.1.dist-info/RECORD,,
|
|
@@ -1,275 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import uuid
|
|
3
|
-
from dataclasses import dataclass, field
|
|
4
|
-
from datetime import date, datetime
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union
|
|
7
|
-
|
|
8
|
-
import numpy as np
|
|
9
|
-
import pandas as pd
|
|
10
|
-
from dateutil import parser
|
|
11
|
-
from pydantic import Field, Secret
|
|
12
|
-
|
|
13
|
-
from unstructured_ingest.error import DestinationConnectionError
|
|
14
|
-
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
15
|
-
from unstructured_ingest.v2.interfaces import (
|
|
16
|
-
AccessConfig,
|
|
17
|
-
ConnectionConfig,
|
|
18
|
-
FileData,
|
|
19
|
-
Uploader,
|
|
20
|
-
UploaderConfig,
|
|
21
|
-
UploadStager,
|
|
22
|
-
UploadStagerConfig,
|
|
23
|
-
)
|
|
24
|
-
from unstructured_ingest.v2.logger import logger
|
|
25
|
-
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
26
|
-
|
|
27
|
-
if TYPE_CHECKING:
|
|
28
|
-
from sqlite3 import Connection as SqliteConnection
|
|
29
|
-
|
|
30
|
-
from psycopg2.extensions import connection as PostgresConnection
|
|
31
|
-
|
|
32
|
-
CONNECTOR_TYPE = "sql"
|
|
33
|
-
ELEMENTS_TABLE_NAME = "elements"
|
|
34
|
-
SQLITE_DB = "sqlite"
|
|
35
|
-
POSTGRESQL_DB = "postgresql"
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
class SQLAccessConfig(AccessConfig):
|
|
39
|
-
username: Optional[str] = Field(default=None, description="DB username")
|
|
40
|
-
password: Optional[str] = Field(default=None, description="DB password")
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
class SQLConnectionConfig(ConnectionConfig):
|
|
44
|
-
db_type: Literal["sqlite", "postgresql"] = Field(
|
|
45
|
-
default=SQLITE_DB, description="Type of the database backend"
|
|
46
|
-
)
|
|
47
|
-
database: Optional[str] = Field(
|
|
48
|
-
default=None,
|
|
49
|
-
description="Database name. For sqlite databases, this is the path to the .db file.",
|
|
50
|
-
)
|
|
51
|
-
host: Optional[str] = Field(default=None, description="DB host")
|
|
52
|
-
port: Optional[int] = Field(default=5432, description="DB host connection port")
|
|
53
|
-
access_config: Secret[SQLAccessConfig] = Field(default=SQLAccessConfig(), validate_default=True)
|
|
54
|
-
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
55
|
-
|
|
56
|
-
def __post_init__(self):
|
|
57
|
-
if (self.db_type == SQLITE_DB) and (self.database is None):
|
|
58
|
-
raise ValueError(
|
|
59
|
-
"A sqlite connection requires a path to a *.db file "
|
|
60
|
-
"through the `database` argument"
|
|
61
|
-
)
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
class SQLUploadStagerConfig(UploadStagerConfig):
|
|
65
|
-
pass
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
_COLUMNS = (
|
|
69
|
-
"id",
|
|
70
|
-
"element_id",
|
|
71
|
-
"text",
|
|
72
|
-
"embeddings",
|
|
73
|
-
"type",
|
|
74
|
-
"system",
|
|
75
|
-
"layout_width",
|
|
76
|
-
"layout_height",
|
|
77
|
-
"points",
|
|
78
|
-
"url",
|
|
79
|
-
"version",
|
|
80
|
-
"date_created",
|
|
81
|
-
"date_modified",
|
|
82
|
-
"date_processed",
|
|
83
|
-
"permissions_data",
|
|
84
|
-
"record_locator",
|
|
85
|
-
"category_depth",
|
|
86
|
-
"parent_id",
|
|
87
|
-
"attached_filename",
|
|
88
|
-
"filetype",
|
|
89
|
-
"last_modified",
|
|
90
|
-
"file_directory",
|
|
91
|
-
"filename",
|
|
92
|
-
"languages",
|
|
93
|
-
"page_number",
|
|
94
|
-
"links",
|
|
95
|
-
"page_name",
|
|
96
|
-
"link_urls",
|
|
97
|
-
"link_texts",
|
|
98
|
-
"sent_from",
|
|
99
|
-
"sent_to",
|
|
100
|
-
"subject",
|
|
101
|
-
"section",
|
|
102
|
-
"header_footer_type",
|
|
103
|
-
"emphasized_text_contents",
|
|
104
|
-
"emphasized_text_tags",
|
|
105
|
-
"text_as_html",
|
|
106
|
-
"regex_metadata",
|
|
107
|
-
"detection_class_prob",
|
|
108
|
-
)
|
|
109
|
-
|
|
110
|
-
_DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
def parse_date_string(date_value: Union[str, int]) -> date:
|
|
114
|
-
try:
|
|
115
|
-
timestamp = float(date_value) / 1000 if isinstance(date_value, int) else float(date_value)
|
|
116
|
-
return datetime.fromtimestamp(timestamp)
|
|
117
|
-
except Exception as e:
|
|
118
|
-
logger.debug(f"date {date_value} string not a timestamp: {e}")
|
|
119
|
-
return parser.parse(date_value)
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
@dataclass
|
|
123
|
-
class SQLUploadStager(UploadStager):
|
|
124
|
-
upload_stager_config: SQLUploadStagerConfig = field(
|
|
125
|
-
default_factory=lambda: SQLUploadStagerConfig()
|
|
126
|
-
)
|
|
127
|
-
|
|
128
|
-
def run(
|
|
129
|
-
self,
|
|
130
|
-
elements_filepath: Path,
|
|
131
|
-
file_data: FileData,
|
|
132
|
-
output_dir: Path,
|
|
133
|
-
output_filename: str,
|
|
134
|
-
**kwargs: Any,
|
|
135
|
-
) -> Path:
|
|
136
|
-
with open(elements_filepath) as elements_file:
|
|
137
|
-
elements_contents: list[dict] = json.load(elements_file)
|
|
138
|
-
output_path = Path(output_dir) / Path(f"{output_filename}.json")
|
|
139
|
-
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
140
|
-
|
|
141
|
-
output = []
|
|
142
|
-
for data in elements_contents:
|
|
143
|
-
metadata: dict[str, Any] = data.pop("metadata", {})
|
|
144
|
-
data_source = metadata.pop("data_source", {})
|
|
145
|
-
coordinates = metadata.pop("coordinates", {})
|
|
146
|
-
|
|
147
|
-
data.update(metadata)
|
|
148
|
-
data.update(data_source)
|
|
149
|
-
data.update(coordinates)
|
|
150
|
-
|
|
151
|
-
data["id"] = str(uuid.uuid4())
|
|
152
|
-
|
|
153
|
-
# remove extraneous, not supported columns
|
|
154
|
-
data = {k: v for k, v in data.items() if k in _COLUMNS}
|
|
155
|
-
|
|
156
|
-
output.append(data)
|
|
157
|
-
|
|
158
|
-
df = pd.DataFrame.from_dict(output)
|
|
159
|
-
for column in filter(lambda x: x in df.columns, _DATE_COLUMNS):
|
|
160
|
-
df[column] = df[column].apply(parse_date_string)
|
|
161
|
-
for column in filter(
|
|
162
|
-
lambda x: x in df.columns,
|
|
163
|
-
("permissions_data", "record_locator", "points", "links"),
|
|
164
|
-
):
|
|
165
|
-
df[column] = df[column].apply(
|
|
166
|
-
lambda x: json.dumps(x) if isinstance(x, (list, dict)) else None
|
|
167
|
-
)
|
|
168
|
-
for column in filter(
|
|
169
|
-
lambda x: x in df.columns,
|
|
170
|
-
("version", "page_number", "regex_metadata"),
|
|
171
|
-
):
|
|
172
|
-
df[column] = df[column].apply(str)
|
|
173
|
-
|
|
174
|
-
with output_path.open("w") as output_file:
|
|
175
|
-
df.to_json(output_file, orient="records", lines=True)
|
|
176
|
-
return output_path
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
class SQLUploaderConfig(UploaderConfig):
|
|
180
|
-
batch_size: int = Field(default=50, description="Number of records per batch")
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
@dataclass
|
|
184
|
-
class SQLUploader(Uploader):
|
|
185
|
-
connector_type: str = CONNECTOR_TYPE
|
|
186
|
-
upload_config: SQLUploaderConfig
|
|
187
|
-
connection_config: SQLConnectionConfig
|
|
188
|
-
|
|
189
|
-
def precheck(self) -> None:
|
|
190
|
-
try:
|
|
191
|
-
cursor = self.connection().cursor()
|
|
192
|
-
cursor.execute("SELECT 1;")
|
|
193
|
-
cursor.close()
|
|
194
|
-
except Exception as e:
|
|
195
|
-
logger.error(f"failed to validate connection: {e}", exc_info=True)
|
|
196
|
-
raise DestinationConnectionError(f"failed to validate connection: {e}")
|
|
197
|
-
|
|
198
|
-
@property
|
|
199
|
-
def connection(self) -> Callable[[], Union["SqliteConnection", "PostgresConnection"]]:
|
|
200
|
-
if self.connection_config.db_type == POSTGRESQL_DB:
|
|
201
|
-
return self._make_psycopg_connection
|
|
202
|
-
elif self.connection_config.db_type == SQLITE_DB:
|
|
203
|
-
return self._make_sqlite_connection
|
|
204
|
-
raise ValueError(f"Unsupported database {self.connection_config.db_type} connection.")
|
|
205
|
-
|
|
206
|
-
def _make_sqlite_connection(self) -> "SqliteConnection":
|
|
207
|
-
from sqlite3 import connect
|
|
208
|
-
|
|
209
|
-
return connect(database=self.connection_config.database)
|
|
210
|
-
|
|
211
|
-
@requires_dependencies(["psycopg2"], extras="postgres")
|
|
212
|
-
def _make_psycopg_connection(self) -> "PostgresConnection":
|
|
213
|
-
from psycopg2 import connect
|
|
214
|
-
|
|
215
|
-
access_config = self.connection_config.access_config.get_secret_value()
|
|
216
|
-
return connect(
|
|
217
|
-
user=access_config.username,
|
|
218
|
-
password=access_config.password,
|
|
219
|
-
dbname=self.connection_config.database,
|
|
220
|
-
host=self.connection_config.host,
|
|
221
|
-
port=self.connection_config.port,
|
|
222
|
-
)
|
|
223
|
-
|
|
224
|
-
def prepare_data(
|
|
225
|
-
self, columns: list[str], data: tuple[tuple[Any, ...], ...]
|
|
226
|
-
) -> list[tuple[Any, ...]]:
|
|
227
|
-
output = []
|
|
228
|
-
for row in data:
|
|
229
|
-
parsed = []
|
|
230
|
-
for column_name, value in zip(columns, row):
|
|
231
|
-
if self.connection_config.db_type == SQLITE_DB and isinstance(value, (list, dict)):
|
|
232
|
-
value = json.dumps(value)
|
|
233
|
-
if column_name in _DATE_COLUMNS:
|
|
234
|
-
if value is None:
|
|
235
|
-
parsed.append(None)
|
|
236
|
-
else:
|
|
237
|
-
parsed.append(parse_date_string(value))
|
|
238
|
-
else:
|
|
239
|
-
parsed.append(value)
|
|
240
|
-
output.append(tuple(parsed))
|
|
241
|
-
return output
|
|
242
|
-
|
|
243
|
-
def upload_contents(self, path: Path) -> None:
|
|
244
|
-
df = pd.read_json(path, orient="records", lines=True)
|
|
245
|
-
logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
|
|
246
|
-
df.replace({np.nan: None}, inplace=True)
|
|
247
|
-
|
|
248
|
-
columns = tuple(df.columns)
|
|
249
|
-
stmt = f"INSERT INTO {ELEMENTS_TABLE_NAME} ({','.join(columns)}) \
|
|
250
|
-
VALUES({','.join(['?' if self.connection_config.db_type==SQLITE_DB else '%s' for x in columns])})" # noqa E501
|
|
251
|
-
|
|
252
|
-
for rows in pd.read_json(
|
|
253
|
-
path, orient="records", lines=True, chunksize=self.upload_config.batch_size
|
|
254
|
-
):
|
|
255
|
-
with self.connection() as conn:
|
|
256
|
-
values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
|
|
257
|
-
if self.connection_config.db_type == SQLITE_DB:
|
|
258
|
-
conn.executemany(stmt, values)
|
|
259
|
-
else:
|
|
260
|
-
with conn.cursor() as cur:
|
|
261
|
-
cur.executemany(stmt, values)
|
|
262
|
-
|
|
263
|
-
conn.commit()
|
|
264
|
-
|
|
265
|
-
def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
|
|
266
|
-
self.upload_contents(path=path)
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
sql_destination_entry = DestinationRegistryEntry(
|
|
270
|
-
connection_config=SQLConnectionConfig,
|
|
271
|
-
uploader=SQLUploader,
|
|
272
|
-
uploader_config=SQLUploaderConfig,
|
|
273
|
-
upload_stager=SQLUploadStager,
|
|
274
|
-
upload_stager_config=SQLUploadStagerConfig,
|
|
275
|
-
)
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/entry_points.txt
RENAMED
|
File without changes
|