unstructured-ingest 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (30)
  1. test/integration/connectors/conftest.py +13 -0
  2. test/integration/connectors/databricks_tests/test_volumes_native.py +8 -4
  3. test/integration/connectors/sql/test_postgres.py +6 -10
  4. test/integration/connectors/sql/test_snowflake.py +205 -0
  5. test/integration/connectors/sql/test_sqlite.py +6 -10
  6. test/integration/connectors/test_delta_table.py +138 -0
  7. test/integration/connectors/utils/docker.py +78 -0
  8. test/integration/connectors/utils/validation.py +93 -2
  9. unstructured_ingest/__version__.py +1 -1
  10. unstructured_ingest/v2/cli/utils/click.py +32 -1
  11. unstructured_ingest/v2/cli/utils/model_conversion.py +10 -3
  12. unstructured_ingest/v2/interfaces/indexer.py +4 -1
  13. unstructured_ingest/v2/pipeline/pipeline.py +10 -2
  14. unstructured_ingest/v2/pipeline/steps/index.py +18 -1
  15. unstructured_ingest/v2/processes/connectors/__init__.py +10 -0
  16. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +1 -1
  17. unstructured_ingest/v2/processes/connectors/delta_table.py +185 -0
  18. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  19. unstructured_ingest/v2/processes/connectors/sql/__init__.py +10 -2
  20. unstructured_ingest/v2/processes/connectors/sql/postgres.py +33 -37
  21. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +164 -0
  22. unstructured_ingest/v2/processes/connectors/sql/sql.py +38 -10
  23. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +31 -32
  24. {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.0.dist-info}/METADATA +14 -12
  25. {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.0.dist-info}/RECORD +29 -24
  26. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +0 -250
  27. {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.0.dist-info}/LICENSE.md +0 -0
  28. {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.0.dist-info}/WHEEL +0 -0
  29. {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.0.dist-info}/entry_points.txt +0 -0
  30. {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.0.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/sql/sqlite.py
@@ -1,15 +1,17 @@
 import json
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Generator
 
-import numpy as np
-import pandas as pd
 from pydantic import Field, Secret, model_validator
 
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
-from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
     _DATE_COLUMNS,
     SQLAccessConfig,
@@ -27,6 +29,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
 
 if TYPE_CHECKING:
     from sqlite3 import Connection as SqliteConnection
+    from sqlite3 import Cursor as SqliteCursor
 
 CONNECTOR_TYPE = "sqlite"
 
@@ -51,10 +54,25 @@ class SQLiteConnectionConfig(SQLConnectionConfig):
             raise ValueError(f"{self.database_path} is not a valid file")
         return self
 
-    def get_connection(self) -> "SqliteConnection":
+    @contextmanager
+    def get_connection(self) -> Generator["SqliteConnection", None, None]:
         from sqlite3 import connect
 
-        return connect(database=self.database_path)
+        connection = connect(database=self.database_path)
+        try:
+            yield connection
+        finally:
+            connection.commit()
+            connection.close()
+
+    @contextmanager
+    def get_cursor(self) -> Generator["SqliteCursor", None, None]:
+        with self.get_connection() as connection:
+            cursor = connection.cursor()
+            try:
+                yield cursor
+            finally:
+                cursor.close()
 
 
 class SQLiteIndexerConfig(SQLIndexerConfig):
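
The rewritten get_connection (and the new get_cursor layered on top of it) turns the bare connection factory into context managers that guarantee commit-and-close semantics for every caller. A minimal standard-library sketch of the same pattern; the "example.db" path is a placeholder, not from the diff:

from contextlib import contextmanager
from sqlite3 import Connection, connect
from typing import Generator


@contextmanager
def get_connection(database_path: str) -> Generator[Connection, None, None]:
    # Mirrors the diff: commit and close even if the caller's block raises.
    connection = connect(database=database_path)
    try:
        yield connection
    finally:
        connection.commit()
        connection.close()


with get_connection("example.db") as conn:  # placeholder database file
    conn.execute("CREATE TABLE IF NOT EXISTS demo (id TEXT)")
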
@@ -67,16 +85,6 @@ class SQLiteIndexer(SQLIndexer):
     index_config: SQLIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    def _get_doc_ids(self) -> list[str]:
-        with self.connection_config.get_connection() as sqlite_connection:
-            cursor = sqlite_connection.cursor()
-            cursor.execute(
-                f"SELECT {self.index_config.id_column} FROM {self.index_config.table_name}"
-            )
-            results = cursor.fetchall()
-            ids = [result[0] for result in results]
-            return ids
-
 
 class SQLiteDownloaderConfig(SQLDownloaderConfig):
     pass
@@ -145,23 +153,14 @@ class SQLiteUploader(SQLUploader):
             output.append(tuple(parsed))
         return output
 
-    def upload_contents(self, path: Path) -> None:
-        df = pd.read_json(path, orient="records", lines=True)
-        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database_path} ")
-        df.replace({np.nan: None}, inplace=True)
-
-        columns = tuple(df.columns)
-        stmt = f"INSERT INTO {self.upload_config.table_name} ({','.join(columns)}) \
-            VALUES({','.join(['?' for x in columns])})"  # noqa E501
-
-        for rows in pd.read_json(
-            path, orient="records", lines=True, chunksize=self.upload_config.batch_size
-        ):
-            with self.connection_config.get_connection() as conn:
-                values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
-                conn.executemany(stmt, values)
-                conn.commit()
 
+sqlite_source_entry = SourceRegistryEntry(
+    connection_config=SQLiteConnectionConfig,
+    indexer_config=SQLiteIndexerConfig,
+    indexer=SQLIndexer,
+    downloader_config=SQLiteDownloaderConfig,
+    downloader=SQLiteDownloader,
+)
 
 sqlite_destination_entry = DestinationRegistryEntry(
     connection_config=SQLiteConnectionConfig,
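
The deleted upload_contents (and with it the module's pandas/numpy dependency) streamed the staged JSON-lines file in batch_size chunks and bulk-inserted each chunk via executemany. A stdlib-only sketch of that batching pattern; the table and column arguments are illustrative, not from the diff:

import json
import sqlite3
from itertools import islice
from pathlib import Path
from typing import Iterator


def iter_batches(path: Path, batch_size: int) -> Iterator[list[dict]]:
    # Stream a JSON-lines file lazily, yielding batch_size rows at a time.
    with path.open() as f:
        rows = (json.loads(line) for line in f)
        while batch := list(islice(rows, batch_size)):
            yield batch


def upload(path: Path, conn: sqlite3.Connection, table: str, columns: tuple[str, ...]) -> None:
    placeholders = ",".join("?" for _ in columns)
    stmt = f"INSERT INTO {table} ({','.join(columns)}) VALUES ({placeholders})"
    for batch in iter_batches(path, batch_size=100):
        conn.executemany(stmt, [tuple(row.get(c) for c in columns) for row in batch])
    conn.commit()
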
{unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.0.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.1.1
+Version: 0.2.0
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,13 +22,13 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist: python-dateutil
-Requires-Dist: pandas
 Requires-Dist: pydantic>=2.7
-Requires-Dist: dataclasses-json
 Requires-Dist: opentelemetry-sdk
-Requires-Dist: click
+Requires-Dist: python-dateutil
 Requires-Dist: tqdm
+Requires-Dist: pandas
+Requires-Dist: click
+Requires-Dist: dataclasses-json
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
@@ -69,8 +69,8 @@ Requires-Dist: unstructured[docx]; extra == "doc"
 Provides-Extra: docx
 Requires-Dist: unstructured[docx]; extra == "docx"
 Provides-Extra: dropbox
-Requires-Dist: fsspec; extra == "dropbox"
 Requires-Dist: dropboxdrivefs; extra == "dropbox"
+Requires-Dist: fsspec; extra == "dropbox"
 Provides-Extra: elasticsearch
 Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
 Provides-Extra: embed-huggingface
@@ -87,8 +87,8 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
 Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
-Requires-Dist: gcsfs; extra == "gcs"
 Requires-Dist: bs4; extra == "gcs"
+Requires-Dist: gcsfs; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
 Provides-Extra: github
 Requires-Dist: pygithub>1.58.0; extra == "github"
@@ -115,15 +115,15 @@ Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: notion
-Requires-Dist: notion-client; extra == "notion"
-Requires-Dist: htmlBuilder; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
+Requires-Dist: notion-client; extra == "notion"
+Requires-Dist: htmlBuilder; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
-Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Requires-Dist: bs4; extra == "onedrive"
+Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
 Provides-Extra: openai
 Requires-Dist: openai; extra == "openai"
@@ -161,15 +161,17 @@ Requires-Dist: s3fs; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: fsspec; extra == "sftp"
 Requires-Dist: paramiko; extra == "sftp"
+Requires-Dist: fsspec; extra == "sftp"
 Provides-Extra: sharepoint
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Requires-Dist: msal; extra == "sharepoint"
 Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: slack
-Requires-Dist: slack-sdk; extra == "slack"
+Requires-Dist: slack-sdk[optional]; extra == "slack"
+Provides-Extra: snowflake
+Requires-Dist: snowflake; extra == "snowflake"
 Provides-Extra: togetherai
 Requires-Dist: together; extra == "togetherai"
 Provides-Extra: tsv
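
Most of the Requires-Dist churn above is ordering noise; the substantive changes are the slack-sdk[optional] pin and the new snowflake extra. One way to check them against an installed copy of the wheel, using only the standard library (assumes 0.2.0 is installed):

from importlib.metadata import metadata

md = metadata("unstructured-ingest")
print(md["Version"])                                # "0.2.0" for this wheel
print("snowflake" in md.get_all("Provides-Extra"))  # True in 0.2.0
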
{unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.0.dist-info}/RECORD
@@ -4,17 +4,20 @@ test/integration/utils.py,sha256=CWqzEGw6TA_ZoP9hRUkW64TWYssooBbufcTRmbJvod8,401
 test/integration/chunkers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/chunkers/test_chunkers.py,sha256=pqn1Rqh36jZTJL4qpU0iuOMFAEQ-LrKAPOgWtQMAt_I,1482
 test/integration/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-test/integration/connectors/conftest.py,sha256=Q8ScDzrzO2o-8D_kYFt8LL7QAhoFTRRtKJKMc2hLMcI,345
+test/integration/connectors/conftest.py,sha256=6dVNMBrL6WIO4KXA-0nf2tNrPYk_tsor8uomi6fbi3Q,727
+test/integration/connectors/test_delta_table.py,sha256=4_KPyQJpd6DmyIjjtXWPMw6NNf7xULRkxmqfbvmZ80g,5018
 test/integration/connectors/test_s3.py,sha256=fK0soCTkNxp-4hm4O2LPrhlZXvYmaeTmeEgeNh1b0k8,5839
 test/integration/connectors/databricks_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-test/integration/connectors/databricks_tests/test_volumes_native.py,sha256=kS45mnNu9_U4qV3cxByEFXCYLEBWRy-fxxhzR3r93cs,5685
+test/integration/connectors/databricks_tests/test_volumes_native.py,sha256=k4lALbwNtlyuI3wd3OHoBULI21E3Ck2Fo8EJXaVfwgw,5812
 test/integration/connectors/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-test/integration/connectors/sql/test_postgres.py,sha256=A9vWj5pBdoEyL2m6d3e2Ep8ZZcnLhdXkaHPPlkTStbg,6581
-test/integration/connectors/sql/test_sqlite.py,sha256=F6Ljb6npmFZlq_5pvJj-0Hkk2mC3T-pMAGyhDm1UtM4,5702
+test/integration/connectors/sql/test_postgres.py,sha256=gDBuNyvWmpVPmDrSSYC99z3t17B_a196P1MwIAOp5Dk,6584
+test/integration/connectors/sql/test_snowflake.py,sha256=XXU2-2z_k8jHWP684v2IuaGOlV3cmPpg3RxkwMp08v8,6998
+test/integration/connectors/sql/test_sqlite.py,sha256=51QrFufAq-XxNjHAkmPWxdJUkGdIRRIGKeRT09A5pkA,5704
 test/integration/connectors/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/connectors/utils/constants.py,sha256=0zSPnsZVqJuNhXduXvdXFQLZTRIQa5Fo_1qjBYVCfb8,209
+test/integration/connectors/utils/docker.py,sha256=-wknXRVlzr3BVPdEhCyJgsdNjO9aSb2xjb-mQ306j7Q,2256
 test/integration/connectors/utils/docker_compose.py,sha256=6XeYOKQFZCBRLEmcgH2mmBAaVs6R6jCWAhJLjq6p-aM,1771
-test/integration/connectors/utils/validation.py,sha256=Sf0ELATWG5K3E3d5S_ArtZeFFYdzoI5jN86U4DiqNyw,8422
+test/integration/connectors/utils/validation.py,sha256=gnflehoYbFkSBJdXQV-7HwcrlL_Cuqni2ri1YmArjT0,12019
 test/integration/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdRIlNDd5AGY,334
 test/integration/embedders/test_bedrock.py,sha256=0oBRNS_DtFDGQ22Z1T3t6VOJ31PrItgvnJpqcLe9Fg4,1903
@@ -43,7 +46,7 @@ test/unit/embed/test_openai.py,sha256=0O1yshDcE0BMKv1yJqrNuiNLSdPhLpKqJ-D_wmnids
 test/unit/embed/test_vertexai.py,sha256=Pl7COc9E3tf_yGidkTEmTizNGyZF1F5zuL2TgPTMnfI,1048
 test/unit/embed/test_voyageai.py,sha256=DviCOJFhe5H4e26-kNyX3JNe8h3qB5Yl0KOe8rQEMrc,981
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=ch9Ch304-rlC6iFyomBT7OHb9bvtQNzaejmd5QwbzKE,42
+unstructured_ingest/__version__.py,sha256=BPrBFKCFfY7EcVqYVDVJGmj1rrsGlJa3283pycTFA3o,42
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=m03BgenxSA34HbW157L7V9TGxK_dTG7N2AnAhF31W-U,31364
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -321,13 +324,13 @@ unstructured_ingest/v2/cli/base/dest.py,sha256=zDjqek7anr0JQ2ptEl8KIAsUXuCuHRnBQ
 unstructured_ingest/v2/cli/base/importer.py,sha256=nRt0QQ3qpi264-n_mR0l55C2ddM8nowTNzT1jsWaam8,1128
 unstructured_ingest/v2/cli/base/src.py,sha256=cpQ43qQju4e5s_YSaPxUtA70BaisRkTBdjtlPhqn5Mg,2872
 unstructured_ingest/v2/cli/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-unstructured_ingest/v2/cli/utils/click.py,sha256=HCEcdHf8Lck0zcx3kidKjLbHDHXIBxPRL2MGgtKtDlg,6967
-unstructured_ingest/v2/cli/utils/model_conversion.py,sha256=uJQKpbTC5ysOdVaRq2SWEjG8btBimVZYzX9NVL7xnzs,7500
+unstructured_ingest/v2/cli/utils/click.py,sha256=1_eJgrwS2DFBl1jZPLsj1vgVgR7agFBIEBe4A_n7mH4,7827
+unstructured_ingest/v2/cli/utils/model_conversion.py,sha256=7eEIkk1KU51-ZNiIfI1KRxlwITNW1xl1YxMAG8BcTk0,7604
 unstructured_ingest/v2/interfaces/__init__.py,sha256=Rfa8crx6De7WNOK-EjsWWwFVpsUfCc6gY8B8tQ3ae9I,899
 unstructured_ingest/v2/interfaces/connector.py,sha256=qUFFJ3qgDMenTCZMtVRjq1DIwsVak6pxNjQOH2eVkMw,1623
 unstructured_ingest/v2/interfaces/downloader.py,sha256=Lj3nTY1hPA71GfNeedFVCdHdZsHLle8qrx5RtXAy9GY,2940
 unstructured_ingest/v2/interfaces/file_data.py,sha256=ieJK-hqHCEOmoYNGoFbCHziSaZyMtRS9VpSoYbwoKCE,1944
-unstructured_ingest/v2/interfaces/indexer.py,sha256=Bd1S-gTLsxhJBLEh1lYm_gXqwQLaEZMoqPq9yGxtN_E,713
+unstructured_ingest/v2/interfaces/indexer.py,sha256=gsa1MLhFa82BzD2h4Yb7ons0VxRwKINZOrzvHAahwVU,846
 unstructured_ingest/v2/interfaces/process.py,sha256=BgglTu5K93FnDDopZKKr_rkK2LTZOguR6kcQjKHjF40,392
 unstructured_ingest/v2/interfaces/processor.py,sha256=VX7JqXlbG1plxMK8THWhWINPbTICaaUEk4XUXhnOixY,3303
 unstructured_ingest/v2/interfaces/upload_stager.py,sha256=ZFkDxcwKn-6EPrTbdBEgOkz1kGAq4gUtze98KP48KG4,1146
@@ -335,13 +338,13 @@ unstructured_ingest/v2/interfaces/uploader.py,sha256=JmZDl1blJa5rS61YHCae3Hfet84
 unstructured_ingest/v2/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/pipeline/interfaces.py,sha256=-Y6gPnl-SbNxIx5-dQCmiYSPKUMjivrRlBLIKIUWVeM,8658
 unstructured_ingest/v2/pipeline/otel.py,sha256=K3pQvWVgWzyOWMKCBUofsH7wTZPJ0Ysw5sLjMBLW41I,1088
-unstructured_ingest/v2/pipeline/pipeline.py,sha256=x6hanD7Cj7Wd5MBUvb33UwXQMZxubzwlAiYyBCMukuc,15693
+unstructured_ingest/v2/pipeline/pipeline.py,sha256=7Yg8_xwlSX6lA-oPGlTcn6KXZ9kc51zsoJxME5TiUlw,15956
 unstructured_ingest/v2/pipeline/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/pipeline/steps/chunk.py,sha256=rYVcHSXeQSzWszg6VmtYlNc66Gsx-22Ti0BxPyQaJak,3135
 unstructured_ingest/v2/pipeline/steps/download.py,sha256=lzvOl5SoUK6OCCVVeG4CzdPIGj6eKKCGdciNo_0RMNk,8173
 unstructured_ingest/v2/pipeline/steps/embed.py,sha256=-YFvmchdsonWiSXxaD7PJfuUUtMLklaQM_8kZCQxCdM,3113
 unstructured_ingest/v2/pipeline/steps/filter.py,sha256=q7bNieaFMprqoF8Mx7w-ZN6jyA5peiGeTGyPtvcV-uw,1199
-unstructured_ingest/v2/pipeline/steps/index.py,sha256=nfDo-wt5sooKtMHKG7sI42m1L44uw-pxErDlDB1engw,2678
+unstructured_ingest/v2/pipeline/steps/index.py,sha256=YUUf1sYZRZSrRgapca3Sfzk1sNPJ05yyTQ5wKlyDjEo,3543
 unstructured_ingest/v2/pipeline/steps/partition.py,sha256=9MQViptxK3ALKco8uE4gK9PpEoGq5JjzyU14C_18blU,3193
 unstructured_ingest/v2/pipeline/steps/stage.py,sha256=cphKgHScLz2rNLZRI5Olsb6dAH-MKGu3p6MYS1BEzkA,2246
 unstructured_ingest/v2/pipeline/steps/uncompress.py,sha256=CFSy4tGp6BAvF0oIwWFN8v4zFzh5pRDeESjEn5iP9hE,1756
@@ -353,13 +356,13 @@ unstructured_ingest/v2/processes/embedder.py,sha256=PQn0IO8xbGRQHpcT2VVl-J8gTJ5H
 unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
 unstructured_ingest/v2/processes/partitioner.py,sha256=2Lhztd730soVC2TOqrn_ba7CGZna8AHHpqJY2ZUYVxE,7776
 unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
-unstructured_ingest/v2/processes/connectors/__init__.py,sha256=glyowqb93_NNreQXoRLbF0PvzMc6Ptv0ARfl3xfSH4E,4967
+unstructured_ingest/v2/processes/connectors/__init__.py,sha256=a7L4N7A2-SzthS6-42FKWymQRW1ydr0cGvDdI2QE--I,5377
 unstructured_ingest/v2/processes/connectors/airtable.py,sha256=Yi7PEv_FejZ9_y3BPY3gu5YGVfeLh-9YX-qLyQHjJsY,8921
 unstructured_ingest/v2/processes/connectors/astradb.py,sha256=ZctZRfXcOAMBGPkKgHvhTmV_-2F0YN5vqwfY9UCHIlU,5791
 unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py,sha256=S55v7TXu30rEdgythMBB_2VcuomyMPmcPtLYykbhw_E,8466
 unstructured_ingest/v2/processes/connectors/chroma.py,sha256=skrxRPHZ8y3JxNa0dt5SVitHiDQ5WVxLvY_kh2-QUrQ,8029
 unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=SONLywyEfoAlLc-HPabXeGzoiwKnekMHIbRMXd4CGXs,12146
-unstructured_ingest/v2/processes/connectors/databricks_volumes.py,sha256=BQHHpCDwE51inD3pZF4tL4zLr7lv6iBcwnA1NazrHqY,9423
+unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=ZZfdNTw1W0ISQGWCtM1JuIME26FYzuPBOqRKql0wlLg,7013
 unstructured_ingest/v2/processes/connectors/elasticsearch.py,sha256=ojxMUHkLa6ZG50aTGn2YWhDHZ1n38uFRn5p8_ghAIvM,16762
 unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=7xOQthcqBd9auJxB0nxZlhh1vdjXpMX_CtQZa6YfZz0,13088
 unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=8bGHbZctJ_Tl1AUSMnI7CCZ7CgEtTRVcRuvlB1HPlqQ,5907
@@ -373,10 +376,11 @@ unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=k_GH55S_OQ6-wCLC6
 unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
 unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=hOaV5gBcHFc6N5Rbu3MgM-5Aol1ht-QkNIN4PqjvfxE,19665
 unstructured_ingest/v2/processes/connectors/singlestore.py,sha256=4rVvWKK2iQr03Ff6cB5zjfE1MpN0JyIGpCxxFCDI6hc,5563
+unstructured_ingest/v2/processes/connectors/slack.py,sha256=b9IanzUApUexiJzuNg7PR3tujOoeG8dhM0L0v4MDuPw,9256
 unstructured_ingest/v2/processes/connectors/utils.py,sha256=8kd0g7lo9NqnpaIkjeO-Ut6erhwUNH_gS9koevpe3WE,878
 unstructured_ingest/v2/processes/connectors/weaviate.py,sha256=Ss0YyD5T6k-00eJ6dr5lSo2H0LcOjVTMmozehyTvnAo,8866
 unstructured_ingest/v2/processes/connectors/databricks/__init__.py,sha256=jO71UTC7bLA_N12CrLWJzh_yZML5gfT7VohxzCpUGWg,1848
-unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=db4PxE1LiKWUq0b9THABFRChArAfHps89pZBglqEg3c,6521
+unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=IBCGt6BQ7vULkPI3jTJZ52emwYg7QeyLZXjOFz9SO3E,6549
 unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=I1MJwe5LOxoPLjwo00H0XbXO6u_SJHWYgsj4s6ePoyI,2754
 unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=P4rfcE3td7WyuuguRgUnGQytCMDpfeYrrpshBZuVynY,3539
 unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=UUotY_-HpgSEJkvdQfZTlbxY7CRLZ4ctL8TlryeFvxk,2790
@@ -390,13 +394,14 @@ unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=-_pYHbsBG9FyRyN
 unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=je1BDqFWlyMfPa4oAMMNFQLLQtCY9quuqx3xjTwF8OQ,6251
 unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=dwpyqDq0qceCBWX3zM1hiUlgXB4hzX6ObOr-sh-5CJs,6926
 unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
-unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=tr3SZH0tz04XSxqGRkUu__tL_0zn0bSms2jILE-3Rug,543
-unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=oMwfYCycX-jTSKW-c6o6K09aU74Wn1B_G3Ib20oYi1A,6050
-unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=MbSvYSjhgGj8HHI7P-gH5bQ0Lqxtf8BEFsKNmCUfzug,9807
-unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=LxC2Q_rPHytbTDflmWzj4H5Jx-41phKnfp6FCpDe-UY,5701
-unstructured_ingest-0.1.1.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
-unstructured_ingest-0.1.1.dist-info/METADATA,sha256=LQ_M1kX7q7rGBvslwml9KbrJGJHAaA_SLWM64BBaZrg,7188
-unstructured_ingest-0.1.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
-unstructured_ingest-0.1.1.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
-unstructured_ingest-0.1.1.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
-unstructured_ingest-0.1.1.dist-info/RECORD,,
+unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=hdGD-V4U3RgnVoJV5S3exKVUfzCLLY7wTwKWvVaihJs,1098
+unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=WUqyjzjmuVvLKCMKnhFhYNRAAQs_cFh0DkSXAJEERyU,5548
+unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=AcW2TxEalYj6c8fhrOWB78JlaB-1hApmdDzCUhQlzW4,5513
+unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=XdMJRgQvcR4Lo2Udl1y8-ZkJw6nVrcXTL-gTsaAHAJw,11196
+unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=9605K36nQ5-gBxzt1daYKYotON1SE85RETusqCJrbdk,5230
+unstructured_ingest-0.2.0.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.2.0.dist-info/METADATA,sha256=F8s5t23zy5zdxICEj6BseR0teRWCQc7IjB_xtlZUkaM,7271
+unstructured_ingest-0.2.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+unstructured_ingest-0.2.0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.2.0.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.2.0.dist-info/RECORD,,
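
Each RECORD row is path,sha256=<digest>,size, where the digest is the urlsafe-base64 SHA-256 of the file with trailing "=" padding stripped (per the wheel spec). A short sketch for recomputing an entry against an installed copy:

import base64
import hashlib
from pathlib import Path


def record_hash(path: Path) -> str:
    # urlsafe base64 of the raw SHA-256 digest, '=' padding removed
    digest = hashlib.sha256(path.read_bytes()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode()

# e.g. record_hash(Path("unstructured_ingest/__version__.py")) should match
# the RECORD entry above for an installed copy of this wheel.
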
unstructured_ingest/v2/processes/connectors/databricks_volumes.py (deleted)
@@ -1,250 +0,0 @@
-import os
-from dataclasses import dataclass
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generator, Optional
-
-from pydantic import Field, Secret
-
-from unstructured_ingest.error import (
-    DestinationConnectionError,
-    SourceConnectionError,
-    SourceConnectionNetworkError,
-)
-from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import (
-    AccessConfig,
-    ConnectionConfig,
-    Downloader,
-    DownloaderConfig,
-    DownloadResponse,
-    FileData,
-    FileDataSourceMetadata,
-    Indexer,
-    IndexerConfig,
-    SourceIdentifiers,
-    Uploader,
-    UploaderConfig,
-)
-from unstructured_ingest.v2.logger import logger
-from unstructured_ingest.v2.processes.connector_registry import (
-    DestinationRegistryEntry,
-    SourceRegistryEntry,
-)
-
-if TYPE_CHECKING:
-    from databricks.sdk import WorkspaceClient
-
-CONNECTOR_TYPE = "databricks_volumes"
-
-
-class DatabricksVolumesAccessConfig(AccessConfig):
-    account_id: Optional[str] = Field(
-        default=None,
-        description="The Databricks account ID for the Databricks "
-        "accounts endpoint. Only has effect when Host is "
-        "either https://accounts.cloud.databricks.com/ (AWS), "
-        "https://accounts.azuredatabricks.net/ (Azure), "
-        "or https://accounts.gcp.databricks.com/ (GCP).",
-    )
-    client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
-    client_secret: Optional[str] = Field(
-        default=None, description="Client Secret of the OAuth app."
-    )
-    token: Optional[str] = Field(
-        default=None,
-        description="The Databricks personal access token (PAT) (AWS, Azure, and GCP) or "
-        "Azure Active Directory (Azure AD) token (Azure).",
-    )
-    profile: Optional[str] = None
-    azure_workspace_resource_id: Optional[str] = Field(
-        default=None,
-        description="The Azure Resource Manager ID for the Azure Databricks workspace, "
-        "which is exchanged for a Databricks host URL.",
-    )
-    azure_client_secret: Optional[str] = Field(
-        default=None, description="The Azure AD service principal’s client secret."
-    )
-    azure_client_id: Optional[str] = Field(
-        default=None, description="The Azure AD service principal’s application ID."
-    )
-    azure_tenant_id: Optional[str] = Field(
-        default=None, description="The Azure AD service principal’s tenant ID."
-    )
-    azure_environment: Optional[str] = Field(
-        default=None,
-        description="The Azure environment type for a " "specific set of API endpoints",
-        examples=["Public", "UsGov", "China", "Germany"],
-    )
-    auth_type: Optional[str] = Field(
-        default=None,
-        description="When multiple auth attributes are available in the "
-        "environment, use the auth type specified by this "
-        "argument. This argument also holds the currently "
-        "selected auth.",
-    )
-    google_credentials: Optional[str] = None
-    google_service_account: Optional[str] = None
-
-
-class DatabricksVolumesConnectionConfig(ConnectionConfig):
-    access_config: Secret[DatabricksVolumesAccessConfig] = Field(
-        default=DatabricksVolumesAccessConfig(), validate_default=True
-    )
-    host: Optional[str] = Field(
-        default=None,
-        description="The Databricks host URL for either the "
-        "Databricks workspace endpoint or the "
-        "Databricks accounts endpoint.",
-    )
-    volume: str = Field(description="Name of volume in the Unity Catalog")
-    catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
-    volume_path: Optional[str] = Field(
-        default=None, description="Optional path within the volume to write to"
-    )
-    databricks_schema: str = Field(
-        default="default",
-        alias="schema",
-        description="Schema associated with the volume to write to in the Unity Catalog service",
-    )
-
-    @property
-    def path(self) -> str:
-        path = f"/Volumes/{self.catalog}/{self.databricks_schema}/{self.volume}"
-        if self.volume_path:
-            path = f"{path}/{self.volume_path}"
-        return path
-
-    @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
-    def get_client(self) -> "WorkspaceClient":
-        from databricks.sdk import WorkspaceClient
-
-        return WorkspaceClient(
-            host=self.host,
-            **self.access_config.get_secret_value().model_dump(),
-        )
-
-
-@dataclass
-class DatabricksVolumesIndexerConfig(IndexerConfig):
-    recursive: bool = False
-
-
-@dataclass
-class DatabricksVolumesIndexer(Indexer):
-    index_config: DatabricksVolumesIndexerConfig
-    connection_config: DatabricksVolumesConnectionConfig
-    connector_type: str = CONNECTOR_TYPE
-
-    def precheck(self) -> None:
-        try:
-            self.connection_config.get_client()
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise SourceConnectionError(f"failed to validate connection: {e}")
-
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        for file_info in self.connection_config.get_client().dbfs.list(
-            path=self.connection_config.path, recursive=self.index_config.recursive
-        ):
-            if file_info.is_dir:
-                continue
-            rel_path = file_info.path.replace(self.connection_config.path, "")
-            if rel_path.startswith("/"):
-                rel_path = rel_path[1:]
-            filename = Path(file_info.path).name
-            yield FileData(
-                identifier=file_info.path,
-                connector_type=CONNECTOR_TYPE,
-                source_identifiers=SourceIdentifiers(
-                    filename=filename,
-                    rel_path=rel_path,
-                    fullpath=file_info.path,
-                ),
-                additional_metadata={
-                    "catalog": self.connection_config.catalog,
-                },
-                metadata=FileDataSourceMetadata(
-                    url=file_info.path, date_modified=str(file_info.modification_time)
-                ),
-            )
-
-
-@dataclass
-class DatabricksVolumesDownloaderConfig(DownloaderConfig):
-    pass
-
-
-@dataclass
-class DatabricksVolumesDownloader(Downloader):
-    download_config: DatabricksVolumesDownloaderConfig
-    connection_config: DatabricksVolumesConnectionConfig
-    connector_type: str = CONNECTOR_TYPE
-
-    def precheck(self) -> None:
-        try:
-            self.connection_config.get_client()
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise SourceConnectionError(f"failed to validate connection: {e}")
-
-    def get_download_path(self, file_data: FileData) -> Path:
-        return self.download_config.download_dir / Path(file_data.source_identifiers.relative_path)
-
-    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        download_path = self.get_download_path(file_data=file_data)
-        download_path.parent.mkdir(parents=True, exist_ok=True)
-        logger.info(f"Writing {file_data.identifier} to {download_path}")
-        try:
-            with self.connection_config.get_client().dbfs.download(path=file_data.identifier) as c:
-                read_content = c._read_handle.read()
-            with open(download_path, "wb") as f:
-                f.write(read_content)
-        except Exception as e:
-            logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
-            raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
-
-        return self.generate_download_response(file_data=file_data, download_path=download_path)
-
-
-class DatabricksVolumesUploaderConfig(UploaderConfig):
-    overwrite: bool = Field(
-        default=False, description="If true, an existing file will be overwritten."
-    )
-
-
-@dataclass
-class DatabricksVolumesUploader(Uploader):
-    upload_config: DatabricksVolumesUploaderConfig
-    connection_config: DatabricksVolumesConnectionConfig
-    connector_type: str = CONNECTOR_TYPE
-
-    def precheck(self) -> None:
-        try:
-            assert self.connection_config.get_client().current_user.me().active
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise DestinationConnectionError(f"failed to validate connection: {e}")
-
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        output_path = os.path.join(self.connection_config.path, path.name)
-        with open(path, "rb") as elements_file:
-            self.connection_config.get_client().files.upload(
-                file_path=output_path,
-                contents=elements_file,
-                overwrite=self.upload_config.overwrite,
-            )
-
-
-databricks_volumes_destination_entry = DestinationRegistryEntry(
-    connection_config=DatabricksVolumesConnectionConfig,
-    uploader=DatabricksVolumesUploader,
-    uploader_config=DatabricksVolumesUploaderConfig,
-)
-
-databricks_volumes_source_entry = SourceRegistryEntry(
-    connection_config=DatabricksVolumesConnectionConfig,
-    indexer=DatabricksVolumesIndexer,
-    indexer_config=DatabricksVolumesIndexerConfig,
-    downloader=DatabricksVolumesDownloader,
-    downloader_config=DatabricksVolumesDownloaderConfig,
-)
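
Per the file list and RECORD diff above, this monolithic connector gives way to the databricks/ package (volumes.py plus per-cloud volumes_aws.py, volumes_azure.py, and volumes_gcp.py modules). For reference, the deleted config's path property assembled Unity Catalog volume paths as follows (a standalone restatement; the function and argument names are illustrative):

from typing import Optional


def volumes_path(catalog: str, schema: str, volume: str, volume_path: Optional[str] = None) -> str:
    # Mirrors DatabricksVolumesConnectionConfig.path from the removed module.
    path = f"/Volumes/{catalog}/{schema}/{volume}"
    if volume_path:
        path = f"{path}/{volume_path}"
    return path


assert volumes_path("main", "default", "docs") == "/Volumes/main/default/docs"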