unstructured-ingest 0.0.24__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (87) hide show
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +42 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +15 -0
  7. test/integration/connectors/databricks_tests/__init__.py +0 -0
  8. test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
  9. test/integration/connectors/test_postgres.py +100 -0
  10. test/integration/connectors/test_s3.py +152 -0
  11. test/integration/connectors/test_sqlite.py +91 -0
  12. test/integration/connectors/utils/__init__.py +0 -0
  13. test/integration/connectors/utils/constants.py +7 -0
  14. test/integration/connectors/utils/docker_compose.py +44 -0
  15. test/integration/connectors/utils/validation.py +198 -0
  16. test/integration/embedders/__init__.py +0 -0
  17. test/integration/embedders/conftest.py +13 -0
  18. test/integration/embedders/test_bedrock.py +49 -0
  19. test/integration/embedders/test_huggingface.py +26 -0
  20. test/integration/embedders/test_mixedbread.py +47 -0
  21. test/integration/embedders/test_octoai.py +41 -0
  22. test/integration/embedders/test_openai.py +41 -0
  23. test/integration/embedders/test_vertexai.py +41 -0
  24. test/integration/embedders/test_voyageai.py +41 -0
  25. test/integration/embedders/togetherai.py +43 -0
  26. test/integration/embedders/utils.py +44 -0
  27. test/integration/partitioners/__init__.py +0 -0
  28. test/integration/partitioners/test_partitioner.py +75 -0
  29. test/integration/utils.py +15 -0
  30. test/unit/__init__.py +0 -0
  31. test/unit/embed/__init__.py +0 -0
  32. test/unit/embed/test_mixedbreadai.py +41 -0
  33. test/unit/embed/test_octoai.py +20 -0
  34. test/unit/embed/test_openai.py +20 -0
  35. test/unit/embed/test_vertexai.py +25 -0
  36. test/unit/embed/test_voyageai.py +24 -0
  37. test/unit/test_chunking_utils.py +36 -0
  38. test/unit/test_error.py +27 -0
  39. test/unit/test_interfaces.py +280 -0
  40. test/unit/test_interfaces_v2.py +26 -0
  41. test/unit/test_logger.py +78 -0
  42. test/unit/test_utils.py +164 -0
  43. test/unit/test_utils_v2.py +82 -0
  44. unstructured_ingest/__version__.py +1 -1
  45. unstructured_ingest/cli/interfaces.py +2 -2
  46. unstructured_ingest/connector/notion/types/block.py +1 -0
  47. unstructured_ingest/connector/notion/types/database.py +1 -0
  48. unstructured_ingest/connector/notion/types/page.py +1 -0
  49. unstructured_ingest/embed/bedrock.py +0 -20
  50. unstructured_ingest/embed/huggingface.py +0 -21
  51. unstructured_ingest/embed/interfaces.py +29 -3
  52. unstructured_ingest/embed/mixedbreadai.py +0 -36
  53. unstructured_ingest/embed/octoai.py +2 -24
  54. unstructured_ingest/embed/openai.py +0 -20
  55. unstructured_ingest/embed/togetherai.py +40 -0
  56. unstructured_ingest/embed/vertexai.py +0 -20
  57. unstructured_ingest/embed/voyageai.py +1 -24
  58. unstructured_ingest/interfaces.py +1 -1
  59. unstructured_ingest/utils/dep_check.py +12 -0
  60. unstructured_ingest/v2/cli/utils/click.py +21 -2
  61. unstructured_ingest/v2/interfaces/connector.py +22 -2
  62. unstructured_ingest/v2/interfaces/downloader.py +1 -0
  63. unstructured_ingest/v2/processes/chunker.py +1 -1
  64. unstructured_ingest/v2/processes/connectors/__init__.py +9 -11
  65. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  66. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
  67. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  68. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  69. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  70. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  71. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +125 -32
  72. unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
  73. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  74. unstructured_ingest/v2/processes/connectors/pinecone.py +9 -1
  75. unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
  76. unstructured_ingest/v2/processes/connectors/sql/postgres.py +121 -0
  77. unstructured_ingest/v2/processes/connectors/sql/sql.py +181 -0
  78. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +109 -0
  79. unstructured_ingest/v2/processes/embedder.py +13 -0
  80. unstructured_ingest/v2/processes/partitioner.py +2 -1
  81. {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/METADATA +12 -10
  82. {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/RECORD +86 -32
  83. {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/top_level.txt +1 -0
  84. unstructured_ingest/v2/processes/connectors/sql.py +0 -275
  85. {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/LICENSE.md +0 -0
  86. {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/WHEEL +0 -0
  87. {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/entry_points.txt +0 -0
@@ -22,6 +22,7 @@ class EmbedderConfig(BaseModel):
22
22
  "voyageai",
23
23
  "octoai",
24
24
  "mixedbread-ai",
25
+ "togetherai",
25
26
  ]
26
27
  ] = Field(default=None, description="Type of the embedding class to be used.")
27
28
  embedding_api_key: Optional[SecretStr] = Field(
@@ -107,6 +108,16 @@ class EmbedderConfig(BaseModel):
107
108
  config=MixedbreadAIEmbeddingConfig.model_validate(embedding_kwargs)
108
109
  )
109
110
 
111
+ def get_togetherai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
112
+ from unstructured_ingest.embed.togetherai import (
113
+ TogetherAIEmbeddingConfig,
114
+ TogetherAIEmbeddingEncoder,
115
+ )
116
+
117
+ return TogetherAIEmbeddingEncoder(
118
+ config=TogetherAIEmbeddingConfig.model_validate(embedding_kwargs)
119
+ )
120
+
110
121
  def get_embedder(self) -> "BaseEmbeddingEncoder":
111
122
  kwargs: dict[str, Any] = {}
112
123
  if self.embedding_api_key:
@@ -133,6 +144,8 @@ class EmbedderConfig(BaseModel):
133
144
  return self.get_voyageai_embedder(embedding_kwargs=kwargs)
134
145
  if self.embedding_provider == "mixedbread-ai":
135
146
  return self.get_mixedbread_embedder(embedding_kwargs=kwargs)
147
+ if self.embedding_provider == "togetherai":
148
+ return self.get_togetherai_embedder(embedding_kwargs=kwargs)
136
149
 
137
150
  raise ValueError(f"{self.embedding_provider} not a recognized encoder")
138
151
 
@@ -55,7 +55,7 @@ class PartitionerConfig(BaseModel):
55
55
  "fields if they exist and drop all other fields. ",
56
56
  )
57
57
  partition_endpoint: Optional[str] = Field(
58
- default="https://api.unstructured.io/general/v0/general",
58
+ default="https://api.unstructuredapp.io/general/v0/general",
59
59
  description="If partitioning via api, use the following host.",
60
60
  )
61
61
  partition_by_api: bool = Field(
@@ -153,6 +153,7 @@ class Partitioner(BaseProcess, ABC):
153
153
  async def partition_via_api(
154
154
  self, filename: Path, metadata: Optional[dict] = None, **kwargs
155
155
  ) -> list[dict]:
156
+ metadata = metadata or {}
156
157
  logger.debug(f"partitioning file {filename} with metadata: {metadata}")
157
158
 
158
159
  elements = await call_api(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: unstructured-ingest
3
- Version: 0.0.24
3
+ Version: 0.1.0
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -23,12 +23,12 @@ Requires-Python: >=3.9.0,<3.13
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE.md
25
25
  Requires-Dist: pydantic>=2.7
26
- Requires-Dist: tqdm
27
- Requires-Dist: click
28
- Requires-Dist: python-dateutil
29
26
  Requires-Dist: opentelemetry-sdk
27
+ Requires-Dist: tqdm
30
28
  Requires-Dist: pandas
29
+ Requires-Dist: python-dateutil
31
30
  Requires-Dist: dataclasses-json
31
+ Requires-Dist: click
32
32
  Provides-Extra: airtable
33
33
  Requires-Dist: pyairtable; extra == "airtable"
34
34
  Provides-Extra: astradb
@@ -44,8 +44,8 @@ Provides-Extra: biomed
44
44
  Requires-Dist: bs4; extra == "biomed"
45
45
  Requires-Dist: requests; extra == "biomed"
46
46
  Provides-Extra: box
47
- Requires-Dist: boxfs; extra == "box"
48
47
  Requires-Dist: fsspec; extra == "box"
48
+ Requires-Dist: boxfs; extra == "box"
49
49
  Provides-Extra: chroma
50
50
  Requires-Dist: chromadb; extra == "chroma"
51
51
  Provides-Extra: clarifai
@@ -87,12 +87,12 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
87
87
  Provides-Extra: epub
88
88
  Requires-Dist: unstructured[epub]; extra == "epub"
89
89
  Provides-Extra: gcs
90
+ Requires-Dist: bs4; extra == "gcs"
90
91
  Requires-Dist: gcsfs; extra == "gcs"
91
92
  Requires-Dist: fsspec; extra == "gcs"
92
- Requires-Dist: bs4; extra == "gcs"
93
93
  Provides-Extra: github
94
- Requires-Dist: pygithub>1.58.0; extra == "github"
95
94
  Requires-Dist: requests; extra == "github"
95
+ Requires-Dist: pygithub>1.58.0; extra == "github"
96
96
  Provides-Extra: gitlab
97
97
  Requires-Dist: python-gitlab; extra == "gitlab"
98
98
  Provides-Extra: google-drive
@@ -116,15 +116,15 @@ Provides-Extra: msg
116
116
  Requires-Dist: unstructured[msg]; extra == "msg"
117
117
  Provides-Extra: notion
118
118
  Requires-Dist: notion-client; extra == "notion"
119
+ Requires-Dist: httpx; extra == "notion"
119
120
  Requires-Dist: backoff; extra == "notion"
120
121
  Requires-Dist: htmlBuilder; extra == "notion"
121
- Requires-Dist: httpx; extra == "notion"
122
122
  Provides-Extra: odt
123
123
  Requires-Dist: unstructured[odt]; extra == "odt"
124
124
  Provides-Extra: onedrive
125
+ Requires-Dist: bs4; extra == "onedrive"
125
126
  Requires-Dist: msal; extra == "onedrive"
126
127
  Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
127
- Requires-Dist: bs4; extra == "onedrive"
128
128
  Provides-Extra: openai
129
129
  Requires-Dist: openai; extra == "openai"
130
130
  Requires-Dist: tiktoken; extra == "openai"
@@ -161,8 +161,8 @@ Requires-Dist: fsspec; extra == "s3"
161
161
  Provides-Extra: salesforce
162
162
  Requires-Dist: simple-salesforce; extra == "salesforce"
163
163
  Provides-Extra: sftp
164
- Requires-Dist: paramiko; extra == "sftp"
165
164
  Requires-Dist: fsspec; extra == "sftp"
165
+ Requires-Dist: paramiko; extra == "sftp"
166
166
  Provides-Extra: sharepoint
167
167
  Requires-Dist: msal; extra == "sharepoint"
168
168
  Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
@@ -170,6 +170,8 @@ Provides-Extra: singlestore
170
170
  Requires-Dist: singlestoredb; extra == "singlestore"
171
171
  Provides-Extra: slack
172
172
  Requires-Dist: slack-sdk; extra == "slack"
173
+ Provides-Extra: togetherai
174
+ Requires-Dist: together; extra == "togetherai"
173
175
  Provides-Extra: tsv
174
176
  Requires-Dist: unstructured[tsv]; extra == "tsv"
175
177
  Provides-Extra: vectara
@@ -1,7 +1,50 @@
1
+ test/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ test/integration/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ test/integration/utils.py,sha256=CWqzEGw6TA_ZoP9hRUkW64TWYssooBbufcTRmbJvod8,401
4
+ test/integration/chunkers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ test/integration/chunkers/test_chunkers.py,sha256=pqn1Rqh36jZTJL4qpU0iuOMFAEQ-LrKAPOgWtQMAt_I,1482
6
+ test/integration/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ test/integration/connectors/conftest.py,sha256=Q8ScDzrzO2o-8D_kYFt8LL7QAhoFTRRtKJKMc2hLMcI,345
8
+ test/integration/connectors/test_postgres.py,sha256=9uaqlUmLpVF09cwKSw7Yldq2kjU00WBedbEIgyJG5Cw,3998
9
+ test/integration/connectors/test_s3.py,sha256=fK0soCTkNxp-4hm4O2LPrhlZXvYmaeTmeEgeNh1b0k8,5839
10
+ test/integration/connectors/test_sqlite.py,sha256=NnLdyt3FfM1A53tXPJbgIcsy-iEgYY8OZYOfliFqifM,3507
11
+ test/integration/connectors/databricks_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
+ test/integration/connectors/databricks_tests/test_volumes_native.py,sha256=kS45mnNu9_U4qV3cxByEFXCYLEBWRy-fxxhzR3r93cs,5685
13
+ test/integration/connectors/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
+ test/integration/connectors/utils/constants.py,sha256=OjxLmmzCbDNqH5tK0jWFxDgIkM973cr3SmFIRk7aySc,222
15
+ test/integration/connectors/utils/docker_compose.py,sha256=6XeYOKQFZCBRLEmcgH2mmBAaVs6R6jCWAhJLjq6p-aM,1771
16
+ test/integration/connectors/utils/validation.py,sha256=VNvyutfnWbnesavL_V5SjM2H3LoOHnkW7Paq8RO4WbM,8199
17
+ test/integration/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
+ test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdRIlNDd5AGY,334
19
+ test/integration/embedders/test_bedrock.py,sha256=0oBRNS_DtFDGQ22Z1T3t6VOJ31PrItgvnJpqcLe9Fg4,1903
20
+ test/integration/embedders/test_huggingface.py,sha256=0mMTOO-Nh7KB70AGs_7LLQIxMYrnSPqyihriUeqACbM,1007
21
+ test/integration/embedders/test_mixedbread.py,sha256=RrLv8SByMNXsgrlh94RbaT-VyxZ4-DILO-OPpmOwvSI,1441
22
+ test/integration/embedders/test_octoai.py,sha256=LnR0BLttamW5PGid6jFxATDAi0x7hq5iWMXurbHP6TI,1328
23
+ test/integration/embedders/test_openai.py,sha256=0jlFqEeeCneIWX9tGyC3TXeUNqsMXR7u5n7uEIaAQKo,1328
24
+ test/integration/embedders/test_vertexai.py,sha256=OtoFzmrWWhGIO5Bbl5zt_4sp6qRHZxtaDQKpGcfzNLM,1345
25
+ test/integration/embedders/test_voyageai.py,sha256=Zqf7nn1AxfBDBr5A9Jr-5pxes4QNvfKiyeGexCCm4nY,1346
26
+ test/integration/embedders/togetherai.py,sha256=0W1ScD5yb1D9hPC2ewUsuCHLUOpCuM083YMBhqAI9fw,1395
27
+ test/integration/embedders/utils.py,sha256=3AMKMBpgBep_0jFqrqMHH8BJo6w60kpouSZ5JPJTwIA,1850
28
+ test/integration/partitioners/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
29
+ test/integration/partitioners/test_partitioner.py,sha256=KEpnhsz2YNAoQ2UZGOTsi1_uk1h4Vg-gGTsy5Fe9OCw,2846
30
+ test/unit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
+ test/unit/test_chunking_utils.py,sha256=0iPwfnMPpyTm-yOE0BXMnEQQP4iguS6NhOqgMQU5nhk,1390
32
+ test/unit/test_error.py,sha256=RflmngCdFNKOLXVfLnUdNfY3Mfg3k7DTEzfIl0B-syU,840
33
+ test/unit/test_interfaces.py,sha256=XNj8qasc1ltaeUv-2y31rv7R9xquo0rgRrMvBZoNZLw,9623
34
+ test/unit/test_interfaces_v2.py,sha256=nyxUsRX1M6Mfhux7SqEhal85PIaWO5xhm6ZTcqpPpHI,790
35
+ test/unit/test_logger.py,sha256=0SKndXE_VRd8XmUHkrj7zuBQHZscXx3ZQllMEOvtF9Y,2380
36
+ test/unit/test_utils.py,sha256=xJ9WGpHBihWpQWvIzd6z99UIdZJba8U7c31h3q6C9To,4800
37
+ test/unit/test_utils_v2.py,sha256=TWVAeE0OrcHgPyzGPtEnQakICsVrDeVhIKPMRQPX554,2638
38
+ test/unit/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
+ test/unit/embed/test_mixedbreadai.py,sha256=XFNJDP5pIgF3eQYwBiuEWmH3zZWx72Wpwyv-Q4m0DJg,1332
40
+ test/unit/embed/test_octoai.py,sha256=Ha9EgAW64Q45hFj51tToe8RyKXWXwqAkdDqSFDMu37Q,831
41
+ test/unit/embed/test_openai.py,sha256=0O1yshDcE0BMKv1yJqrNuiNLSdPhLpKqJ-D_wmnidsM,831
42
+ test/unit/embed/test_vertexai.py,sha256=Pl7COc9E3tf_yGidkTEmTizNGyZF1F5zuL2TgPTMnfI,1048
43
+ test/unit/embed/test_voyageai.py,sha256=DviCOJFhe5H4e26-kNyX3JNe8h3qB5Yl0KOe8rQEMrc,981
1
44
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
2
- unstructured_ingest/__version__.py,sha256=i77-gjXpw3EQpetJm6qwuhTR53KoBsdCYSBjHDaGJUQ,43
45
+ unstructured_ingest/__version__.py,sha256=J87Ao0q5WoHKbDEbH6O10GOGaMO3yEUCBOxCqbm715I,42
3
46
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
4
- unstructured_ingest/interfaces.py,sha256=0r0gQoHJQ4DVSQEVbUPBA3N6WyvGMkR1u6U2SwUvoAQ,31361
47
+ unstructured_ingest/interfaces.py,sha256=m03BgenxSA34HbW157L7V9TGxK_dTG7N2AnAhF31W-U,31364
5
48
  unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
6
49
  unstructured_ingest/main.py,sha256=82G_7eG4PNhc_xIqj4Y_sFbDV9VI-nwSfsfJQMzovMk,169
7
50
  unstructured_ingest/processor.py,sha256=XKKrvbxsb--5cDzz4hB3-GfWZYyIjJ2ah8FpzQKF_DM,2760
@@ -9,7 +52,7 @@ unstructured_ingest/cli/__init__.py,sha256=9kNcBOHuXON5lB1MJU9QewEhwPmId56vXqB29
9
52
  unstructured_ingest/cli/cli.py,sha256=LutBTBYMqboKw8cputHVszpenyfnySzcUC15ifwuYyg,1049
10
53
  unstructured_ingest/cli/cmd_factory.py,sha256=UdHm1KacTombpF6DxyTSwTCuApsKHUYw_kVu5Nhcy3Y,364
11
54
  unstructured_ingest/cli/common.py,sha256=I0El08FHz5kxw7iz0VWOWPrvcJD1rBgXJSwVIpVmmwU,204
12
- unstructured_ingest/cli/interfaces.py,sha256=nWZVXAoLEP08eDPj10c2nwHNbd-HXOHFa4YvEdUJ8y8,24084
55
+ unstructured_ingest/cli/interfaces.py,sha256=lpaaOdAQ4NMsawVaHSk5lXCcZ0Mw85kRzfElu1ODCB0,24090
13
56
  unstructured_ingest/cli/utils.py,sha256=KNhkFNKOeEihc8HlvMz_MTbYVQNFklrBKbC8xg9h1xE,7982
14
57
  unstructured_ingest/cli/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
58
  unstructured_ingest/cli/base/cmd.py,sha256=BbfjA2v203Jh-7DL6bzxQ7fOeNixd5BsBMuzXz6h5IQ,583
@@ -106,11 +149,11 @@ unstructured_ingest/connector/notion/connector.py,sha256=8A9d-Pej-uXzjEy85zUloxI
106
149
  unstructured_ingest/connector/notion/helpers.py,sha256=-eEB8eSqdD5bWX_QEA2hZz1siucC0FNEUEqCEJptiVk,20702
107
150
  unstructured_ingest/connector/notion/interfaces.py,sha256=SrTT-9c0nvk0fMqVgudYF647r04AdMKi6wkIkMy7Szw,563
108
151
  unstructured_ingest/connector/notion/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
109
- unstructured_ingest/connector/notion/types/block.py,sha256=AKOY-o6CTFC-caWlkLfKskMuFemH4-Vdrhv7HnRkS8w,3009
110
- unstructured_ingest/connector/notion/types/database.py,sha256=UvrxuCd62wIYtgDKSkyGEBZHwfqvPKq1U3qr3w-zLAI,2551
152
+ unstructured_ingest/connector/notion/types/block.py,sha256=w3j3F_z-50dpIpBt4Ib8_U4eINZRFMfGHdbE3hjkbu0,3028
153
+ unstructured_ingest/connector/notion/types/database.py,sha256=1SrP5sxWhif8dxCteXnJAFX2HwoXI2EJy9IRIzM_nGM,2570
111
154
  unstructured_ingest/connector/notion/types/date.py,sha256=Ah0ekF18S_9xVDT2Ps1NGD1eOihtInGIYji_BDIalig,729
112
155
  unstructured_ingest/connector/notion/types/file.py,sha256=xc5UQ46qWvVd3SkKJctRBqMVERCNc_UVVc21pu66IME,1291
113
- unstructured_ingest/connector/notion/types/page.py,sha256=PR3xT7OdO31zHDpp3bhgc5GLdcFuk8F6jOqGlOu5xNg,1408
156
+ unstructured_ingest/connector/notion/types/page.py,sha256=oZS5ausaiA68Ux-i6mOA0qYywP0X7YchXL8gWscMaxQ,1427
114
157
  unstructured_ingest/connector/notion/types/parent.py,sha256=VTNyL5JNVLb5AqR5P-c658DC9bUgkRKPA9fI2CFZWoU,1695
115
158
  unstructured_ingest/connector/notion/types/rich_text.py,sha256=V0fqXLAq7H5A6Av0IM8TqqhqW45VWD8K79sHdh1FyA8,5450
116
159
  unstructured_ingest/connector/notion/types/user.py,sha256=VK-XYFt2WdtEqm_LGnmE22ms7xw84ia3pSBhpmy2IKg,1800
@@ -167,14 +210,15 @@ unstructured_ingest/connector/notion/types/database_properties/unique_id.py,sha2
167
210
  unstructured_ingest/connector/notion/types/database_properties/url.py,sha256=iXQ2tVUm9UlKVtDA0NQiFIRJ5PHYW9wOaWt2vFfSVCg,862
168
211
  unstructured_ingest/connector/notion/types/database_properties/verification.py,sha256=J_DLjY-v2T6xDGMQ7FkI0YMKMA6SG6Y3yYW7qUD1hKA,2334
169
212
  unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
170
- unstructured_ingest/embed/bedrock.py,sha256=5-pKWwOEGHKOHa06wYuKOhvT8Xu72ke6nrpCnRtkAaU,3872
171
- unstructured_ingest/embed/huggingface.py,sha256=ku_JQr72KBG8n5b6KRkXIbeBGzdgLw_KKIEm1dFK3oM,2729
172
- unstructured_ingest/embed/interfaces.py,sha256=L5WimR69bmEvliIBlZ8wOCH_YDA9DWteCu6QEsKCV5I,1113
173
- unstructured_ingest/embed/mixedbreadai.py,sha256=NSrAt1_bjphTHLUnlzzWSBU25UBCZlpYaLdWSRSGyqs,5504
174
- unstructured_ingest/embed/octoai.py,sha256=0zxAUAMzodGkqMwqMkEvSfgWLNHtEnhdvUofvJDQD1A,2368
175
- unstructured_ingest/embed/openai.py,sha256=4Ee4A2rQ8OlSh_yiJSFmok_qqRDi1A3KyayB5YiPLFw,2058
176
- unstructured_ingest/embed/vertexai.py,sha256=cgyRyTm_dO_qyedwbIhOQIFvKjCqZBoDh606ykzTYHI,3598
177
- unstructured_ingest/embed/voyageai.py,sha256=6BWNJUZOqkHSMaO2XPVZVYAVRrAtpMWQZEKp0qgp20Q,2631
213
+ unstructured_ingest/embed/bedrock.py,sha256=-PRdZsF44vwi6G4G75gdO31AJKfZWClOXkJQAk7rEO8,3096
214
+ unstructured_ingest/embed/huggingface.py,sha256=2cBiQhOhfWHX3hS-eKjocysOkUaRlyRfUj9Kxjrp6cE,1934
215
+ unstructured_ingest/embed/interfaces.py,sha256=au4Xp8ciDvo4bidlUbazFW2aC7NZW5-UDLKXBFVzAX4,2025
216
+ unstructured_ingest/embed/mixedbreadai.py,sha256=OwFWWukvkQaXhjgs6b6N6D4w7sYrtcHNhsHAj-Bocj4,4268
217
+ unstructured_ingest/embed/octoai.py,sha256=jHytDfQgup0v1PBcmlMv1nIh9Obg8WGO5qtLmN-Ot5g,1473
218
+ unstructured_ingest/embed/openai.py,sha256=JXo4boivNoo2lBzHuS4Z0FZ1zlgUGAPVt0X3HY540ZU,1282
219
+ unstructured_ingest/embed/togetherai.py,sha256=BL7NzExSE-laQqrp4ybUgoZ9JG_eop4hk-s2yCO_d5c,1451
220
+ unstructured_ingest/embed/vertexai.py,sha256=X5bGJdXyR5nAFH_ocAVgEowmd60nOBykyfclYo3VfBM,2808
221
+ unstructured_ingest/embed/voyageai.py,sha256=bjom9QqWmH1Mv08ewg8ZG7gO3rQPMVS0_ztm2KBAOjI,1821
178
222
  unstructured_ingest/enhanced_dataclass/__init__.py,sha256=gDZOUsv5eo-8jm4Yu7DdDwi101aGbfG7JctTdOYnTOM,151
179
223
  unstructured_ingest/enhanced_dataclass/core.py,sha256=d6aUkDynuKX87cHx9_N5UDUWrvISR4jYRFRTvd_avlI,3038
180
224
  unstructured_ingest/enhanced_dataclass/dataclasses.py,sha256=aZMsoCzAGRb8Rmh3BTSBFtNr6FmFTY93KYGLk3gYJKQ,1949
@@ -257,7 +301,7 @@ unstructured_ingest/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
257
301
  unstructured_ingest/utils/chunking.py,sha256=efWEfMcCukG5zASZrXhkNgAX8AzHa6t3rClMzm2TwFE,1521
258
302
  unstructured_ingest/utils/compression.py,sha256=NNiY-2S2Gf3at7zC1PYxMijaEza9vVSzRn5mdFf6mHo,4434
259
303
  unstructured_ingest/utils/data_prep.py,sha256=9UKewDHB8-cMlQ8POvokhjVsy-ksiSqAAW2ibqPYAfk,4400
260
- unstructured_ingest/utils/dep_check.py,sha256=cVEqZtMwji8BIt7pjtUOMtEmN7KaNXRXwelEKFpOdW8,1914
304
+ unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWhmyBa7776Y,2400
261
305
  unstructured_ingest/utils/google_filetype.py,sha256=YVspEkiiBrRUSGVeVbsavvLvTmizdy2e6TsjigXTSRU,468
262
306
  unstructured_ingest/utils/string_and_date_utils.py,sha256=LwcbLmWpwt1zEabLlyUd5kIf9oOWcZxsRzxDglLCMeU,1375
263
307
  unstructured_ingest/utils/table.py,sha256=aWjcowDVSClNpEAdR6PY3H7khKu4T6T3QqQE6GjmQ_M,3469
@@ -276,11 +320,11 @@ unstructured_ingest/v2/cli/base/dest.py,sha256=zDjqek7anr0JQ2ptEl8KIAsUXuCuHRnBQ
276
320
  unstructured_ingest/v2/cli/base/importer.py,sha256=nRt0QQ3qpi264-n_mR0l55C2ddM8nowTNzT1jsWaam8,1128
277
321
  unstructured_ingest/v2/cli/base/src.py,sha256=cpQ43qQju4e5s_YSaPxUtA70BaisRkTBdjtlPhqn5Mg,2872
278
322
  unstructured_ingest/v2/cli/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
279
- unstructured_ingest/v2/cli/utils/click.py,sha256=Wn2s3PuvBCKB0lsK-W7X_Y0eYyWnS6Y9wWo1OhVBOzY,6344
323
+ unstructured_ingest/v2/cli/utils/click.py,sha256=HCEcdHf8Lck0zcx3kidKjLbHDHXIBxPRL2MGgtKtDlg,6967
280
324
  unstructured_ingest/v2/cli/utils/model_conversion.py,sha256=uJQKpbTC5ysOdVaRq2SWEjG8btBimVZYzX9NVL7xnzs,7500
281
325
  unstructured_ingest/v2/interfaces/__init__.py,sha256=Rfa8crx6De7WNOK-EjsWWwFVpsUfCc6gY8B8tQ3ae9I,899
282
- unstructured_ingest/v2/interfaces/connector.py,sha256=KG0pHdAcpuO5h72xrAkJzADmjxbav31TZ2Wo3PBvwT0,765
283
- unstructured_ingest/v2/interfaces/downloader.py,sha256=PKT1kr79Mz1urW_8xCyq9sBuK93gDvyTXg5e4ma4htU,2871
326
+ unstructured_ingest/v2/interfaces/connector.py,sha256=qUFFJ3qgDMenTCZMtVRjq1DIwsVak6pxNjQOH2eVkMw,1623
327
+ unstructured_ingest/v2/interfaces/downloader.py,sha256=Lj3nTY1hPA71GfNeedFVCdHdZsHLle8qrx5RtXAy9GY,2940
284
328
  unstructured_ingest/v2/interfaces/file_data.py,sha256=ieJK-hqHCEOmoYNGoFbCHziSaZyMtRS9VpSoYbwoKCE,1944
285
329
  unstructured_ingest/v2/interfaces/indexer.py,sha256=Bd1S-gTLsxhJBLEh1lYm_gXqwQLaEZMoqPq9yGxtN_E,713
286
330
  unstructured_ingest/v2/interfaces/process.py,sha256=BgglTu5K93FnDDopZKKr_rkK2LTZOguR6kcQjKHjF40,392
@@ -302,34 +346,40 @@ unstructured_ingest/v2/pipeline/steps/stage.py,sha256=cphKgHScLz2rNLZRI5Olsb6dAH
302
346
  unstructured_ingest/v2/pipeline/steps/uncompress.py,sha256=CFSy4tGp6BAvF0oIwWFN8v4zFzh5pRDeESjEn5iP9hE,1756
303
347
  unstructured_ingest/v2/pipeline/steps/upload.py,sha256=zlgXgwReX9TBOdfTpS9hETah4SeSmzPB2g8dAGfLIvM,1987
304
348
  unstructured_ingest/v2/processes/__init__.py,sha256=FaHWSCGyc7GWVnAsNEUUj7L8hT8gCVY3_hUE2VzWtUg,462
305
- unstructured_ingest/v2/processes/chunker.py,sha256=76PrpCSd8k3DpfdZcl8I10u7vciKzhSV9ZByrrp302g,5476
349
+ unstructured_ingest/v2/processes/chunker.py,sha256=1bfJ2qgl6qu2HvClzHbC7-q5QtUp7mrlNxZxnPGYTm0,5479
306
350
  unstructured_ingest/v2/processes/connector_registry.py,sha256=vkEe6jpgdYtZCxMj59s5atWGgmPuxAEXRUoTt-MJ7wc,2198
307
- unstructured_ingest/v2/processes/embedder.py,sha256=nFYiOmIJwWLodBt_cC-E5h7zmYB9t3hLu2BWtBStm3g,5977
351
+ unstructured_ingest/v2/processes/embedder.py,sha256=PQn0IO8xbGRQHpcT2VVl-J8gTJ5HGGEP9gdEAwMVK3U,6498
308
352
  unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
309
- unstructured_ingest/v2/processes/partitioner.py,sha256=bpqmZDsKKi6qtxNWdIWBfQmr1ccQUhU0axecpGAUf_4,7739
353
+ unstructured_ingest/v2/processes/partitioner.py,sha256=2Lhztd730soVC2TOqrn_ba7CGZna8AHHpqJY2ZUYVxE,7776
310
354
  unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
311
- unstructured_ingest/v2/processes/connectors/__init__.py,sha256=6iBdoH6BW8oMK1ZvEi0IgEchuk0cNUPoNIaikpzeML8,4992
355
+ unstructured_ingest/v2/processes/connectors/__init__.py,sha256=glyowqb93_NNreQXoRLbF0PvzMc6Ptv0ARfl3xfSH4E,4967
312
356
  unstructured_ingest/v2/processes/connectors/airtable.py,sha256=Yi7PEv_FejZ9_y3BPY3gu5YGVfeLh-9YX-qLyQHjJsY,8921
313
357
  unstructured_ingest/v2/processes/connectors/astradb.py,sha256=ZctZRfXcOAMBGPkKgHvhTmV_-2F0YN5vqwfY9UCHIlU,5791
314
358
  unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py,sha256=S55v7TXu30rEdgythMBB_2VcuomyMPmcPtLYykbhw_E,8466
315
359
  unstructured_ingest/v2/processes/connectors/chroma.py,sha256=skrxRPHZ8y3JxNa0dt5SVitHiDQ5WVxLvY_kh2-QUrQ,8029
316
360
  unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=SONLywyEfoAlLc-HPabXeGzoiwKnekMHIbRMXd4CGXs,12146
317
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py,sha256=E_4DzeemC4mhZsVuLmSXtfy4MR1MoU6CNyvpRqsKnJU,6030
361
+ unstructured_ingest/v2/processes/connectors/databricks_volumes.py,sha256=BQHHpCDwE51inD3pZF4tL4zLr7lv6iBcwnA1NazrHqY,9423
318
362
  unstructured_ingest/v2/processes/connectors/elasticsearch.py,sha256=ojxMUHkLa6ZG50aTGn2YWhDHZ1n38uFRn5p8_ghAIvM,16762
319
363
  unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=7xOQthcqBd9auJxB0nxZlhh1vdjXpMX_CtQZa6YfZz0,13088
320
364
  unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=D71gt8fsPOXi2-Rir8mATw6dRM3BdzYGnn62qG1iaBw,5586
321
365
  unstructured_ingest/v2/processes/connectors/local.py,sha256=a3stgnIkhBbXPIQD0O-RaRM-Eb-szHj9Yy4Fz881-9c,6723
322
366
  unstructured_ingest/v2/processes/connectors/milvus.py,sha256=ZUlyAQyTt0U1JoapFYHQW3IIaGYY50b3URDSLEAFjtk,7687
323
- unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=2_R_hrEAaTU4vJTCK9oKblWTgv6BKjyUhFtC7uq3q2w,4859
367
+ unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=A0pt6JcNTD5bEu79jZ8KhnHcBQ2VUJ2AjtQAtdFr_Lo,13175
324
368
  unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=ZiUo-dFo1LMOvFwphSLRZiR1PcrN8GWLTHhsh4TU6n0,9207
325
369
  unstructured_ingest/v2/processes/connectors/opensearch.py,sha256=dfDSNrWIEk19wuHdlMJpp_SLMOteNPlkDBPlAwu1LVY,6767
326
- unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=8St-JaVrDdQEVZpRS_TfjFusfjg0bAg3IYyykGFyWdw,7169
370
+ unstructured_ingest/v2/processes/connectors/outlook.py,sha256=NK67Pd8Nk5oUIXTK-sK18K7rZ_Cl0UuCbeF2ExBEZho,9294
371
+ unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=k_GH55S_OQ6-wCLC6gkhRrNpXIFECYZ_2Gjz_XRtY6Y,7561
327
372
  unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
328
373
  unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=hOaV5gBcHFc6N5Rbu3MgM-5Aol1ht-QkNIN4PqjvfxE,19665
329
374
  unstructured_ingest/v2/processes/connectors/singlestore.py,sha256=4rVvWKK2iQr03Ff6cB5zjfE1MpN0JyIGpCxxFCDI6hc,5563
330
- unstructured_ingest/v2/processes/connectors/sql.py,sha256=srj2ECKnkGR_iEFBdpa8sxw3ACCvJ5L0uoKCuHxKUe4,9204
331
375
  unstructured_ingest/v2/processes/connectors/utils.py,sha256=8kd0g7lo9NqnpaIkjeO-Ut6erhwUNH_gS9koevpe3WE,878
332
376
  unstructured_ingest/v2/processes/connectors/weaviate.py,sha256=Ss0YyD5T6k-00eJ6dr5lSo2H0LcOjVTMmozehyTvnAo,8866
377
+ unstructured_ingest/v2/processes/connectors/databricks/__init__.py,sha256=jO71UTC7bLA_N12CrLWJzh_yZML5gfT7VohxzCpUGWg,1848
378
+ unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=db4PxE1LiKWUq0b9THABFRChArAfHps89pZBglqEg3c,6521
379
+ unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=I1MJwe5LOxoPLjwo00H0XbXO6u_SJHWYgsj4s6ePoyI,2754
380
+ unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=P4rfcE3td7WyuuguRgUnGQytCMDpfeYrrpshBZuVynY,3539
381
+ unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=UUotY_-HpgSEJkvdQfZTlbxY7CRLZ4ctL8TlryeFvxk,2790
382
+ unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=Wk7s2_u5G0BOV5slvGc8IlUf7ivznY9PrgPqe6nlJKM,2897
333
383
  unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Ypl_n6sl7I1JqX6bGSG0t_FqvCqE3Cy24og,1846
334
384
  unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=Y01BuVRql0Kvzc_cdaZE9dDGYjJzrwJu-etfUrEGcUU,7061
335
385
  unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=Cjk0LUxqOCDbme0GmnD_5_b1hfStjI23cKw6BquKNrg,5488
@@ -339,9 +389,13 @@ unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=-_pYHbsBG9FyRyN
339
389
  unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=je1BDqFWlyMfPa4oAMMNFQLLQtCY9quuqx3xjTwF8OQ,6251
340
390
  unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=dwpyqDq0qceCBWX3zM1hiUlgXB4hzX6ObOr-sh-5CJs,6926
341
391
  unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
342
- unstructured_ingest-0.0.24.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
343
- unstructured_ingest-0.0.24.dist-info/METADATA,sha256=rHTF8fy1vNg5NmCBNVdobYWeGgpn_PBKao2z54UbgnE,7108
344
- unstructured_ingest-0.0.24.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
345
- unstructured_ingest-0.0.24.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
346
- unstructured_ingest-0.0.24.dist-info/top_level.txt,sha256=QaTxTcjfM5Hr9sZJ6weOJvSe5ESQc0F8AWkhHInTCf8,20
347
- unstructured_ingest-0.0.24.dist-info/RECORD,,
392
+ unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=tr3SZH0tz04XSxqGRkUu__tL_0zn0bSms2jILE-3Rug,543
393
+ unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=hqNuGYR_9o5LmfVDXnm3jBF5Pk-s7R66d0epF2uBYuM,4083
394
+ unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=8bDUgyDurQelOabNnSG6ejWWsnLGWf-A-lWrpwYDGQM,5140
395
+ unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=K-Lquxxqa1m5fk9by-5sasq561TRFAeV_SZ1Hc_b9Hk,3426
396
+ unstructured_ingest-0.1.0.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
397
+ unstructured_ingest-0.1.0.dist-info/METADATA,sha256=mNOS5HjbygWcTZ5eFlxoPpvt6dVAjkYniNHpk6tLvQw,7181
398
+ unstructured_ingest-0.1.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
399
+ unstructured_ingest-0.1.0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
400
+ unstructured_ingest-0.1.0.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
401
+ unstructured_ingest-0.1.0.dist-info/RECORD,,
@@ -1,275 +0,0 @@
1
- import json
2
- import uuid
3
- from dataclasses import dataclass, field
4
- from datetime import date, datetime
5
- from pathlib import Path
6
- from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union
7
-
8
- import numpy as np
9
- import pandas as pd
10
- from dateutil import parser
11
- from pydantic import Field, Secret
12
-
13
- from unstructured_ingest.error import DestinationConnectionError
14
- from unstructured_ingest.utils.dep_check import requires_dependencies
15
- from unstructured_ingest.v2.interfaces import (
16
- AccessConfig,
17
- ConnectionConfig,
18
- FileData,
19
- Uploader,
20
- UploaderConfig,
21
- UploadStager,
22
- UploadStagerConfig,
23
- )
24
- from unstructured_ingest.v2.logger import logger
25
- from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
26
-
27
- if TYPE_CHECKING:
28
- from sqlite3 import Connection as SqliteConnection
29
-
30
- from psycopg2.extensions import connection as PostgresConnection
31
-
32
- CONNECTOR_TYPE = "sql"
33
- ELEMENTS_TABLE_NAME = "elements"
34
- SQLITE_DB = "sqlite"
35
- POSTGRESQL_DB = "postgresql"
36
-
37
-
38
- class SQLAccessConfig(AccessConfig):
39
- username: Optional[str] = Field(default=None, description="DB username")
40
- password: Optional[str] = Field(default=None, description="DB password")
41
-
42
-
43
- class SQLConnectionConfig(ConnectionConfig):
44
- db_type: Literal["sqlite", "postgresql"] = Field(
45
- default=SQLITE_DB, description="Type of the database backend"
46
- )
47
- database: Optional[str] = Field(
48
- default=None,
49
- description="Database name. For sqlite databases, this is the path to the .db file.",
50
- )
51
- host: Optional[str] = Field(default=None, description="DB host")
52
- port: Optional[int] = Field(default=5432, description="DB host connection port")
53
- access_config: Secret[SQLAccessConfig] = Field(default=SQLAccessConfig(), validate_default=True)
54
- connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
55
-
56
- def __post_init__(self):
57
- if (self.db_type == SQLITE_DB) and (self.database is None):
58
- raise ValueError(
59
- "A sqlite connection requires a path to a *.db file "
60
- "through the `database` argument"
61
- )
62
-
63
-
64
- class SQLUploadStagerConfig(UploadStagerConfig):
65
- pass
66
-
67
-
68
- _COLUMNS = (
69
- "id",
70
- "element_id",
71
- "text",
72
- "embeddings",
73
- "type",
74
- "system",
75
- "layout_width",
76
- "layout_height",
77
- "points",
78
- "url",
79
- "version",
80
- "date_created",
81
- "date_modified",
82
- "date_processed",
83
- "permissions_data",
84
- "record_locator",
85
- "category_depth",
86
- "parent_id",
87
- "attached_filename",
88
- "filetype",
89
- "last_modified",
90
- "file_directory",
91
- "filename",
92
- "languages",
93
- "page_number",
94
- "links",
95
- "page_name",
96
- "link_urls",
97
- "link_texts",
98
- "sent_from",
99
- "sent_to",
100
- "subject",
101
- "section",
102
- "header_footer_type",
103
- "emphasized_text_contents",
104
- "emphasized_text_tags",
105
- "text_as_html",
106
- "regex_metadata",
107
- "detection_class_prob",
108
- )
109
-
110
- _DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")
111
-
112
-
113
- def parse_date_string(date_value: Union[str, int]) -> date:
114
- try:
115
- timestamp = float(date_value) / 1000 if isinstance(date_value, int) else float(date_value)
116
- return datetime.fromtimestamp(timestamp)
117
- except Exception as e:
118
- logger.debug(f"date {date_value} string not a timestamp: {e}")
119
- return parser.parse(date_value)
120
-
121
-
122
- @dataclass
123
- class SQLUploadStager(UploadStager):
124
- upload_stager_config: SQLUploadStagerConfig = field(
125
- default_factory=lambda: SQLUploadStagerConfig()
126
- )
127
-
128
- def run(
129
- self,
130
- elements_filepath: Path,
131
- file_data: FileData,
132
- output_dir: Path,
133
- output_filename: str,
134
- **kwargs: Any,
135
- ) -> Path:
136
- with open(elements_filepath) as elements_file:
137
- elements_contents: list[dict] = json.load(elements_file)
138
- output_path = Path(output_dir) / Path(f"{output_filename}.json")
139
- output_path.parent.mkdir(parents=True, exist_ok=True)
140
-
141
- output = []
142
- for data in elements_contents:
143
- metadata: dict[str, Any] = data.pop("metadata", {})
144
- data_source = metadata.pop("data_source", {})
145
- coordinates = metadata.pop("coordinates", {})
146
-
147
- data.update(metadata)
148
- data.update(data_source)
149
- data.update(coordinates)
150
-
151
- data["id"] = str(uuid.uuid4())
152
-
153
- # remove extraneous, not supported columns
154
- data = {k: v for k, v in data.items() if k in _COLUMNS}
155
-
156
- output.append(data)
157
-
158
- df = pd.DataFrame.from_dict(output)
159
- for column in filter(lambda x: x in df.columns, _DATE_COLUMNS):
160
- df[column] = df[column].apply(parse_date_string)
161
- for column in filter(
162
- lambda x: x in df.columns,
163
- ("permissions_data", "record_locator", "points", "links"),
164
- ):
165
- df[column] = df[column].apply(
166
- lambda x: json.dumps(x) if isinstance(x, (list, dict)) else None
167
- )
168
- for column in filter(
169
- lambda x: x in df.columns,
170
- ("version", "page_number", "regex_metadata"),
171
- ):
172
- df[column] = df[column].apply(str)
173
-
174
- with output_path.open("w") as output_file:
175
- df.to_json(output_file, orient="records", lines=True)
176
- return output_path
177
-
178
-
179
- class SQLUploaderConfig(UploaderConfig):
180
- batch_size: int = Field(default=50, description="Number of records per batch")
181
-
182
-
183
- @dataclass
184
- class SQLUploader(Uploader):
185
- connector_type: str = CONNECTOR_TYPE
186
- upload_config: SQLUploaderConfig
187
- connection_config: SQLConnectionConfig
188
-
189
- def precheck(self) -> None:
190
- try:
191
- cursor = self.connection().cursor()
192
- cursor.execute("SELECT 1;")
193
- cursor.close()
194
- except Exception as e:
195
- logger.error(f"failed to validate connection: {e}", exc_info=True)
196
- raise DestinationConnectionError(f"failed to validate connection: {e}")
197
-
198
- @property
199
- def connection(self) -> Callable[[], Union["SqliteConnection", "PostgresConnection"]]:
200
- if self.connection_config.db_type == POSTGRESQL_DB:
201
- return self._make_psycopg_connection
202
- elif self.connection_config.db_type == SQLITE_DB:
203
- return self._make_sqlite_connection
204
- raise ValueError(f"Unsupported database {self.connection_config.db_type} connection.")
205
-
206
- def _make_sqlite_connection(self) -> "SqliteConnection":
207
- from sqlite3 import connect
208
-
209
- return connect(database=self.connection_config.database)
210
-
211
- @requires_dependencies(["psycopg2"], extras="postgres")
212
- def _make_psycopg_connection(self) -> "PostgresConnection":
213
- from psycopg2 import connect
214
-
215
- access_config = self.connection_config.access_config.get_secret_value()
216
- return connect(
217
- user=access_config.username,
218
- password=access_config.password,
219
- dbname=self.connection_config.database,
220
- host=self.connection_config.host,
221
- port=self.connection_config.port,
222
- )
223
-
224
- def prepare_data(
225
- self, columns: list[str], data: tuple[tuple[Any, ...], ...]
226
- ) -> list[tuple[Any, ...]]:
227
- output = []
228
- for row in data:
229
- parsed = []
230
- for column_name, value in zip(columns, row):
231
- if self.connection_config.db_type == SQLITE_DB and isinstance(value, (list, dict)):
232
- value = json.dumps(value)
233
- if column_name in _DATE_COLUMNS:
234
- if value is None:
235
- parsed.append(None)
236
- else:
237
- parsed.append(parse_date_string(value))
238
- else:
239
- parsed.append(value)
240
- output.append(tuple(parsed))
241
- return output
242
-
243
- def upload_contents(self, path: Path) -> None:
244
- df = pd.read_json(path, orient="records", lines=True)
245
- logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
246
- df.replace({np.nan: None}, inplace=True)
247
-
248
- columns = tuple(df.columns)
249
- stmt = f"INSERT INTO {ELEMENTS_TABLE_NAME} ({','.join(columns)}) \
250
- VALUES({','.join(['?' if self.connection_config.db_type==SQLITE_DB else '%s' for x in columns])})" # noqa E501
251
-
252
- for rows in pd.read_json(
253
- path, orient="records", lines=True, chunksize=self.upload_config.batch_size
254
- ):
255
- with self.connection() as conn:
256
- values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
257
- if self.connection_config.db_type == SQLITE_DB:
258
- conn.executemany(stmt, values)
259
- else:
260
- with conn.cursor() as cur:
261
- cur.executemany(stmt, values)
262
-
263
- conn.commit()
264
-
265
- def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
266
- self.upload_contents(path=path)
267
-
268
-
269
- sql_destination_entry = DestinationRegistryEntry(
270
- connection_config=SQLConnectionConfig,
271
- uploader=SQLUploader,
272
- uploader_config=SQLUploaderConfig,
273
- upload_stager=SQLUploadStager,
274
- upload_stager_config=SQLUploadStagerConfig,
275
- )