unstructured-ingest 0.0.2__py3-none-any.whl → 0.0.2.dev0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (37) hide show
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/v2/cli/base/cmd.py +0 -10
  3. unstructured_ingest/v2/cli/base/src.py +0 -2
  4. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +9 -1
  5. unstructured_ingest/v2/cli/cmds/local.py +8 -0
  6. unstructured_ingest/v2/cli/configs/__init__.py +1 -8
  7. unstructured_ingest/v2/interfaces/__init__.py +1 -2
  8. unstructured_ingest/v2/interfaces/downloader.py +3 -9
  9. unstructured_ingest/v2/interfaces/file_data.py +1 -6
  10. unstructured_ingest/v2/interfaces/process.py +0 -3
  11. unstructured_ingest/v2/pipeline/interfaces.py +5 -3
  12. unstructured_ingest/v2/pipeline/pipeline.py +2 -72
  13. unstructured_ingest/v2/pipeline/steps/download.py +13 -77
  14. unstructured_ingest/v2/processes/connectors/astra.py +0 -8
  15. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +0 -8
  16. unstructured_ingest/v2/processes/connectors/chroma.py +6 -8
  17. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +0 -9
  18. unstructured_ingest/v2/processes/connectors/elasticsearch.py +9 -23
  19. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +27 -12
  20. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +5 -13
  21. unstructured_ingest/v2/processes/connectors/google_drive.py +9 -13
  22. unstructured_ingest/v2/processes/connectors/local.py +15 -15
  23. unstructured_ingest/v2/processes/connectors/mongodb.py +4 -10
  24. unstructured_ingest/v2/processes/connectors/onedrive.py +2 -14
  25. unstructured_ingest/v2/processes/connectors/pinecone.py +3 -6
  26. unstructured_ingest/v2/processes/connectors/salesforce.py +8 -10
  27. unstructured_ingest/v2/processes/connectors/sharepoint.py +8 -14
  28. unstructured_ingest/v2/processes/connectors/sql.py +9 -24
  29. unstructured_ingest/v2/processes/connectors/weaviate.py +5 -13
  30. {unstructured_ingest-0.0.2.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/METADATA +15 -15
  31. {unstructured_ingest-0.0.2.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/RECORD +34 -37
  32. unstructured_ingest/v2/cli/configs/filter.py +0 -28
  33. unstructured_ingest/v2/pipeline/steps/filter.py +0 -40
  34. unstructured_ingest/v2/processes/filter.py +0 -54
  35. {unstructured_ingest-0.0.2.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/WHEEL +0 -0
  36. {unstructured_ingest-0.0.2.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/entry_points.txt +0 -0
  37. {unstructured_ingest-0.0.2.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/top_level.txt +0 -0
@@ -4,14 +4,13 @@ import uuid
4
4
  from dataclasses import dataclass, field
5
5
  from datetime import date, datetime
6
6
  from pathlib import Path
7
- from typing import TYPE_CHECKING, Any, Callable, Optional, Union
7
+ from typing import Any, Optional, Union
8
8
 
9
9
  import numpy as np
10
10
  import pandas as pd
11
11
  from dateutil import parser
12
12
 
13
13
  from unstructured_ingest.enhanced_dataclass import enhanced_field
14
- from unstructured_ingest.error import DestinationConnectionError
15
14
  from unstructured_ingest.utils.dep_check import requires_dependencies
16
15
  from unstructured_ingest.v2.interfaces import (
17
16
  AccessConfig,
@@ -26,11 +25,6 @@ from unstructured_ingest.v2.interfaces import (
26
25
  from unstructured_ingest.v2.logger import logger
27
26
  from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
28
27
 
29
- if TYPE_CHECKING:
30
- from sqlite3 import Connection as SqliteConnection
31
-
32
- from psycopg2.extensions import connection as PostgresConnection
33
-
34
28
  CONNECTOR_TYPE = "sql"
35
29
  ELEMENTS_TABLE_NAME = "elements"
36
30
 
@@ -47,7 +41,7 @@ class DatabaseType(str, enum.Enum):
47
41
 
48
42
 
49
43
  @dataclass
50
- class SQLConnectionConfig(ConnectionConfig):
44
+ class SimpleSqlConfig(ConnectionConfig):
51
45
  db_type: DatabaseType = (
52
46
  # required default value here because of parent class
53
47
  DatabaseType.SQLITE
@@ -140,7 +134,7 @@ class SQLUploadStager(UploadStager):
140
134
  **kwargs: Any,
141
135
  ) -> Path:
142
136
  with open(elements_filepath) as elements_file:
143
- elements_contents: list[dict] = json.load(elements_file)
137
+ elements_contents = json.load(elements_file)
144
138
  output_path = Path(output_dir) / Path(f"{output_filename}.json")
145
139
  output_path.parent.mkdir(parents=True, exist_ok=True)
146
140
 
@@ -157,7 +151,7 @@ class SQLUploadStager(UploadStager):
157
151
  data["id"] = str(uuid.uuid4())
158
152
 
159
153
  # remove extraneous, not supported columns
160
- data = {k: v for k, v in data.items() if k in _COLUMNS}
154
+ [data.pop(column) for column in data if column not in _COLUMNS]
161
155
 
162
156
  output.append(data)
163
157
 
@@ -191,32 +185,23 @@ class SQLUploaderConfig(UploaderConfig):
191
185
  class SQLUploader(Uploader):
192
186
  connector_type: str = CONNECTOR_TYPE
193
187
  upload_config: SQLUploaderConfig
194
- connection_config: SQLConnectionConfig
195
-
196
- def precheck(self) -> None:
197
- try:
198
- cursor = self.connection().cursor()
199
- cursor.execute("SELECT 1;")
200
- cursor.close()
201
- except Exception as e:
202
- logger.error(f"failed to validate connection: {e}", exc_info=True)
203
- raise DestinationConnectionError(f"failed to validate connection: {e}")
188
+ connection_config: SimpleSqlConfig
204
189
 
205
190
  @property
206
- def connection(self) -> Callable[[], Union["SqliteConnection", "PostgresConnection"]]:
191
+ def connection(self):
207
192
  if self.connection_config.db_type == DatabaseType.POSTGRESQL:
208
193
  return self._make_psycopg_connection
209
194
  elif self.connection_config.db_type == DatabaseType.SQLITE:
210
195
  return self._make_sqlite_connection
211
196
  raise ValueError(f"Unsupported database {self.connection_config.db_type} connection.")
212
197
 
213
- def _make_sqlite_connection(self) -> "SqliteConnection":
198
+ def _make_sqlite_connection(self):
214
199
  from sqlite3 import connect
215
200
 
216
201
  return connect(database=self.connection_config.database)
217
202
 
218
203
  @requires_dependencies(["psycopg2"], extras="postgres")
219
- def _make_psycopg_connection(self) -> "PostgresConnection":
204
+ def _make_psycopg_connection(self):
220
205
  from psycopg2 import connect
221
206
 
222
207
  return connect(
@@ -276,7 +261,7 @@ class SQLUploader(Uploader):
276
261
 
277
262
 
278
263
  sql_destination_entry = DestinationRegistryEntry(
279
- connection_config=SQLConnectionConfig,
264
+ connection_config=SimpleSqlConfig,
280
265
  uploader=SQLUploader,
281
266
  uploader_config=SQLUploaderConfig,
282
267
  upload_stager=SQLUploadStager,
@@ -7,7 +7,6 @@ from typing import TYPE_CHECKING, Any, Optional
7
7
  from dateutil import parser
8
8
 
9
9
  from unstructured_ingest.enhanced_dataclass import enhanced_field
10
- from unstructured_ingest.error import DestinationConnectionError
11
10
  from unstructured_ingest.utils.dep_check import requires_dependencies
12
11
  from unstructured_ingest.v2.interfaces import (
13
12
  AccessConfig,
@@ -157,21 +156,15 @@ class WeaviateUploaderConfig(UploaderConfig):
157
156
  class WeaviateUploader(Uploader):
158
157
  upload_config: WeaviateUploaderConfig
159
158
  connection_config: WeaviateConnectionConfig
159
+ client: Optional["Client"] = field(init=False)
160
160
  connector_type: str = CONNECTOR_TYPE
161
161
 
162
162
  @requires_dependencies(["weaviate"], extras="weaviate")
163
- def get_client(self) -> "Client":
163
+ def __post_init__(self):
164
164
  from weaviate import Client
165
165
 
166
166
  auth = self._resolve_auth_method()
167
- return Client(url=self.connection_config.host_url, auth_client_secret=auth)
168
-
169
- def precheck(self) -> None:
170
- try:
171
- self.get_client()
172
- except Exception as e:
173
- logger.error(f"Failed to validate connection {e}", exc_info=True)
174
- raise DestinationConnectionError(f"failed to validate connection: {e}")
167
+ self.client = Client(url=self.connection_config.host_url, auth_client_secret=auth)
175
168
 
176
169
  @requires_dependencies(["weaviate"], extras="weaviate")
177
170
  def _resolve_auth_method(self):
@@ -222,9 +215,8 @@ class WeaviateUploader(Uploader):
222
215
  f"at {self.connection_config.host_url}",
223
216
  )
224
217
 
225
- client = self.get_client()
226
- client.batch.configure(batch_size=self.upload_config.batch_size)
227
- with client.batch as b:
218
+ self.client.batch.configure(batch_size=self.upload_config.batch_size)
219
+ with self.client.batch as b:
228
220
  for e in elements_dict:
229
221
  vector = e.pop("embeddings", None)
230
222
  b.add_data_object(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: unstructured-ingest
3
- Version: 0.0.2
3
+ Version: 0.0.2.dev0
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -29,8 +29,8 @@ Requires-Dist: pyairtable ; extra == 'airtable'
29
29
  Provides-Extra: astra
30
30
  Requires-Dist: astrapy ; extra == 'astra'
31
31
  Provides-Extra: azure
32
- Requires-Dist: fsspec ; extra == 'azure'
33
32
  Requires-Dist: adlfs ; extra == 'azure'
33
+ Requires-Dist: fsspec ; extra == 'azure'
34
34
  Provides-Extra: azure-cognitive-search
35
35
  Requires-Dist: azure-search-documents ; extra == 'azure-cognitive-search'
36
36
  Provides-Extra: bedrock
@@ -42,9 +42,9 @@ Provides-Extra: box
42
42
  Requires-Dist: boxfs ; extra == 'box'
43
43
  Requires-Dist: fsspec ; extra == 'box'
44
44
  Provides-Extra: chroma
45
- Requires-Dist: importlib-metadata >=7.1.0 ; extra == 'chroma'
46
45
  Requires-Dist: typer <=0.9.0 ; extra == 'chroma'
47
46
  Requires-Dist: chromadb ; extra == 'chroma'
47
+ Requires-Dist: importlib-metadata >=7.1.0 ; extra == 'chroma'
48
48
  Provides-Extra: clarifai
49
49
  Requires-Dist: clarifai ; extra == 'clarifai'
50
50
  Provides-Extra: confluence
@@ -54,8 +54,8 @@ Requires-Dist: unstructured[tsv] ; extra == 'csv'
54
54
  Provides-Extra: databricks-volumes
55
55
  Requires-Dist: databricks-sdk ; extra == 'databricks-volumes'
56
56
  Provides-Extra: delta-table
57
- Requires-Dist: deltalake ; extra == 'delta-table'
58
57
  Requires-Dist: fsspec ; extra == 'delta-table'
58
+ Requires-Dist: deltalake ; extra == 'delta-table'
59
59
  Provides-Extra: discord
60
60
  Requires-Dist: discord-py ; extra == 'discord'
61
61
  Provides-Extra: doc
@@ -63,21 +63,21 @@ Requires-Dist: unstructured[docx] ; extra == 'doc'
63
63
  Provides-Extra: docx
64
64
  Requires-Dist: unstructured[docx] ; extra == 'docx'
65
65
  Provides-Extra: dropbox
66
- Requires-Dist: dropboxdrivefs ; extra == 'dropbox'
67
66
  Requires-Dist: fsspec ; extra == 'dropbox'
67
+ Requires-Dist: dropboxdrivefs ; extra == 'dropbox'
68
68
  Provides-Extra: elasticsearch
69
69
  Requires-Dist: elasticsearch[async] ; extra == 'elasticsearch'
70
70
  Provides-Extra: embed-huggingface
71
+ Requires-Dist: langchain-community ; extra == 'embed-huggingface'
71
72
  Requires-Dist: huggingface ; extra == 'embed-huggingface'
72
73
  Requires-Dist: sentence-transformers ; extra == 'embed-huggingface'
73
- Requires-Dist: langchain-community ; extra == 'embed-huggingface'
74
74
  Provides-Extra: embed-octoai
75
75
  Requires-Dist: tiktoken ; extra == 'embed-octoai'
76
76
  Requires-Dist: openai ; extra == 'embed-octoai'
77
77
  Provides-Extra: embed-vertexai
78
- Requires-Dist: langchain-community ; extra == 'embed-vertexai'
79
78
  Requires-Dist: langchain ; extra == 'embed-vertexai'
80
79
  Requires-Dist: langchain-google-vertexai ; extra == 'embed-vertexai'
80
+ Requires-Dist: langchain-community ; extra == 'embed-vertexai'
81
81
  Provides-Extra: embed-voyageai
82
82
  Requires-Dist: langchain ; extra == 'embed-voyageai'
83
83
  Requires-Dist: langchain-voyageai ; extra == 'embed-voyageai'
@@ -85,8 +85,8 @@ Provides-Extra: epub
85
85
  Requires-Dist: unstructured[epub] ; extra == 'epub'
86
86
  Provides-Extra: gcs
87
87
  Requires-Dist: bs4 ; extra == 'gcs'
88
- Requires-Dist: gcsfs ; extra == 'gcs'
89
88
  Requires-Dist: fsspec ; extra == 'gcs'
89
+ Requires-Dist: gcsfs ; extra == 'gcs'
90
90
  Provides-Extra: github
91
91
  Requires-Dist: pygithub >1.58.0 ; extra == 'github'
92
92
  Provides-Extra: gitlab
@@ -109,25 +109,25 @@ Requires-Dist: pymongo ; extra == 'mongodb'
109
109
  Provides-Extra: msg
110
110
  Requires-Dist: unstructured[msg] ; extra == 'msg'
111
111
  Provides-Extra: notion
112
- Requires-Dist: htmlBuilder ; extra == 'notion'
113
112
  Requires-Dist: notion-client ; extra == 'notion'
113
+ Requires-Dist: htmlBuilder ; extra == 'notion'
114
114
  Provides-Extra: odt
115
115
  Requires-Dist: unstructured[odt] ; extra == 'odt'
116
116
  Provides-Extra: onedrive
117
+ Requires-Dist: Office365-REST-Python-Client ; extra == 'onedrive'
117
118
  Requires-Dist: msal ; extra == 'onedrive'
118
119
  Requires-Dist: bs4 ; extra == 'onedrive'
119
- Requires-Dist: Office365-REST-Python-Client ; extra == 'onedrive'
120
120
  Provides-Extra: openai
121
- Requires-Dist: tiktoken ; extra == 'openai'
122
- Requires-Dist: openai ; extra == 'openai'
123
121
  Requires-Dist: langchain-community ; extra == 'openai'
122
+ Requires-Dist: openai ; extra == 'openai'
123
+ Requires-Dist: tiktoken ; extra == 'openai'
124
124
  Provides-Extra: opensearch
125
125
  Requires-Dist: opensearch-py ; extra == 'opensearch'
126
126
  Provides-Extra: org
127
127
  Requires-Dist: unstructured[org] ; extra == 'org'
128
128
  Provides-Extra: outlook
129
- Requires-Dist: msal ; extra == 'outlook'
130
129
  Requires-Dist: Office365-REST-Python-Client ; extra == 'outlook'
130
+ Requires-Dist: msal ; extra == 'outlook'
131
131
  Provides-Extra: pdf
132
132
  Requires-Dist: unstructured[pdf] ; extra == 'pdf'
133
133
  Provides-Extra: pinecone
@@ -152,11 +152,11 @@ Requires-Dist: fsspec ; extra == 's3'
152
152
  Provides-Extra: salesforce
153
153
  Requires-Dist: simple-salesforce ; extra == 'salesforce'
154
154
  Provides-Extra: sftp
155
- Requires-Dist: paramiko ; extra == 'sftp'
156
155
  Requires-Dist: fsspec ; extra == 'sftp'
156
+ Requires-Dist: paramiko ; extra == 'sftp'
157
157
  Provides-Extra: sharepoint
158
- Requires-Dist: msal ; extra == 'sharepoint'
159
158
  Requires-Dist: Office365-REST-Python-Client ; extra == 'sharepoint'
159
+ Requires-Dist: msal ; extra == 'sharepoint'
160
160
  Provides-Extra: singlestore
161
161
  Requires-Dist: singlestoredb ; extra == 'singlestore'
162
162
  Provides-Extra: slack
@@ -1,5 +1,5 @@
1
1
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
2
- unstructured_ingest/__version__.py,sha256=t0CFzEk7qlIWbgyEWA53ytTKmHbZ9ow2lAyjeP1bFqw,42
2
+ unstructured_ingest/__version__.py,sha256=neOBPct_gjgXqs6YN8HnzdaRiPiugEbJpwI6SDZ7qac,47
3
3
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
4
4
  unstructured_ingest/evaluate.py,sha256=R-mKLFXbVX1xQ1tjGsLHjdP-TbSSV-925IHzggW_bIg,9793
5
5
  unstructured_ingest/interfaces.py,sha256=uS8L5mS0mXD8I4XTfVlKZxAwqnpJ4yrRqn4vxWVRhQI,31107
@@ -260,10 +260,10 @@ unstructured_ingest/v2/cli/cli.py,sha256=qHXIs-PcvMgDZhP1AR9iDMxh8FXBMJCEDksPBfi
260
260
  unstructured_ingest/v2/cli/interfaces.py,sha256=4Bbow6QHks2a1H56tmVQ4vG3sZy-577ZbwrPmDfizmE,829
261
261
  unstructured_ingest/v2/cli/utils.py,sha256=QK-ee6FzxPf-IbaNXXWlH-GhvqeNnjK2m8ljBD1SusU,9075
262
262
  unstructured_ingest/v2/cli/base/__init__.py,sha256=zXCa7F4FMqItmzxfUIVmyI-CeGh8X85yF8lRxwX_OYQ,83
263
- unstructured_ingest/v2/cli/base/cmd.py,sha256=qi9N5rcyyE2nmswFaoKWbs1PonhHsMC5llqND9-rQso,9790
263
+ unstructured_ingest/v2/cli/base/cmd.py,sha256=qVHmquVsVDoYyPByKdUTVCwAFfILMYBw5w6eTTVku-E,9308
264
264
  unstructured_ingest/v2/cli/base/dest.py,sha256=YMbVIHmYDqvOtxZeEY93stmF2p2ImjuJts7-u-NznYw,2887
265
265
  unstructured_ingest/v2/cli/base/importer.py,sha256=nRt0QQ3qpi264-n_mR0l55C2ddM8nowTNzT1jsWaam8,1128
266
- unstructured_ingest/v2/cli/base/src.py,sha256=oUPO9GPEbkYm1udfD4YQBYTfaefbhpoIN1HPnD672SQ,2460
266
+ unstructured_ingest/v2/cli/base/src.py,sha256=7LnZh9FgUX9rerBH6cizVtTWmM6R2sRkxatnGsxYHG0,2410
267
267
  unstructured_ingest/v2/cli/cmds/__init__.py,sha256=DWPMD6Wqus22sSoIEyTSiOJAm97aNjvdpdrXgsL4uQ0,2647
268
268
  unstructured_ingest/v2/cli/cmds/astra.py,sha256=L-GR2KSP_cFQkQm0aVcdiXmgYMJZCVKIAH794y8qT1M,2590
269
269
  unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py,sha256=VTCSUYeIYKnP60lC7DeBYqoqAJnWuBZrwevCXbeIEzw,2248
@@ -271,7 +271,7 @@ unstructured_ingest/v2/cli/cmds/chroma.py,sha256=RinNOPripk2zRYx1Rt-u-jywXbwh7Js
271
271
  unstructured_ingest/v2/cli/cmds/databricks_volumes.py,sha256=53d9A7UunJLYZFwwwHEraVshFc3gSzUbmKjMOiv7hn4,5920
272
272
  unstructured_ingest/v2/cli/cmds/elasticsearch.py,sha256=joUfnV992fAwEDCtFVJaABwgpyQiWeDl1ZCBEudRtnk,5258
273
273
  unstructured_ingest/v2/cli/cmds/google_drive.py,sha256=mXozabpi8kjRFb0S7kw-xMGtEuFoVUxnvefwL5ZIPHc,2334
274
- unstructured_ingest/v2/cli/cmds/local.py,sha256=UOTYjSdNqCFxhZfN6bdxm8jRp6Ijun2K-WpQq1X83OQ,1544
274
+ unstructured_ingest/v2/cli/cmds/local.py,sha256=lGBFOVDRlrcCtPFjyk0IAYHLRWg95Kunu1Kli7t0ZK4,1899
275
275
  unstructured_ingest/v2/cli/cmds/milvus.py,sha256=PB1ib1rFGGH_-KDi1bSIO3BIiVcqSJEHCBFFrzQrnmI,1998
276
276
  unstructured_ingest/v2/cli/cmds/mongodb.py,sha256=oyV6tacuuxm3dN-AXQgbxvYJiDYo2OOWQKRSBCUGj0E,1823
277
277
  unstructured_ingest/v2/cli/cmds/onedrive.py,sha256=DKqhQyyF-swZxs3C9G5W8ECleq8sWpDbpTuiAHXukXQ,2781
@@ -286,34 +286,32 @@ unstructured_ingest/v2/cli/cmds/fsspec/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
286
286
  unstructured_ingest/v2/cli/cmds/fsspec/azure.py,sha256=ZHfchzSpGkZ99Fq1050JvHP0-aG1pZsBZxxozcFfxwI,2784
287
287
  unstructured_ingest/v2/cli/cmds/fsspec/box.py,sha256=kslkI-0-GyGSJOU7bKgrZeQRXh8HFexDq87ew8kT8kE,1338
288
288
  unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py,sha256=LtcR3rCQPgzJNbV3S90HlL0LPPbW9lYEfE8BG4F-dSI,1349
289
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py,sha256=BlJLEu6YJUejLLFzlSCVSoZDp2pdjoTsdoFFHVwwkVY,2031
289
+ unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py,sha256=Mgr_nto1FWxnGmbGdVlIfm-xuBGL0HEi8k3FEmQnZng,2414
290
290
  unstructured_ingest/v2/cli/cmds/fsspec/gcs.py,sha256=3-0LYnDs0fgNrDqnHpNZKj_6rwNj9wQVaV0lGOhVFPE,2737
291
291
  unstructured_ingest/v2/cli/cmds/fsspec/s3.py,sha256=EXQzYkDtkFli2sfcj4cRDRPFac7b7z1DfQqYlGQcE6o,2279
292
292
  unstructured_ingest/v2/cli/cmds/fsspec/sftp.py,sha256=YY2xKguawMyLdcG0qDYKUgk7DT0KgyZJlV17MfwIhpo,2036
293
- unstructured_ingest/v2/cli/configs/__init__.py,sha256=nAJ1gT3yCAzoZbEbYswE2SMfSk7TEPxa_1v3qEUsgIQ,336
293
+ unstructured_ingest/v2/cli/configs/__init__.py,sha256=5NMXm872QQZTvUFZFS06c8c1b6K940K5gxs9lbp8W6M,258
294
294
  unstructured_ingest/v2/cli/configs/chunk.py,sha256=KvIhmIRIZxazCumMztAKdWs-4MK7qzOb5h6Ned_2bdU,3547
295
295
  unstructured_ingest/v2/cli/configs/embed.py,sha256=q_TwnkxKTKOsMgVYfW6xxbD8FWjU_Uh_X2BQ5-_VLGM,2725
296
- unstructured_ingest/v2/cli/configs/filter.py,sha256=KYe65_au6m7H4VrjgugC2ain6vsUSWswNSEgcG66VPU,841
297
296
  unstructured_ingest/v2/cli/configs/partition.py,sha256=7wdI18V6c4kaXuf50Lh66n9LbtrYHYd8ffEgDQLqvSk,3931
298
297
  unstructured_ingest/v2/cli/configs/processor.py,sha256=ZHu2DBIuE8VgL3mEt73yYimw2k_PaOEtdxxFqzHfk84,3350
299
- unstructured_ingest/v2/interfaces/__init__.py,sha256=Rfa8crx6De7WNOK-EjsWWwFVpsUfCc6gY8B8tQ3ae9I,899
298
+ unstructured_ingest/v2/interfaces/__init__.py,sha256=-CHWUlT4rISd-gSfcGKGYFqqSFhMY9lKsT5wxwmOThM,845
300
299
  unstructured_ingest/v2/interfaces/connector.py,sha256=u4hE1DpTPDC04-n_IzYyn9w1gNCiPT81anrUoEh30Z8,855
301
- unstructured_ingest/v2/interfaces/downloader.py,sha256=zs7cxhzbWVc5L0bV4gdCTexWGMVeXTQ9jJF6PCYSAss,2790
302
- unstructured_ingest/v2/interfaces/file_data.py,sha256=PZrPJBkNC63lNO_1nwvnAeKRxjM3CsjIY6jSO8T9bVM,1665
300
+ unstructured_ingest/v2/interfaces/downloader.py,sha256=aWlacZZrI6SGw6retnRJtZbqT5voOYq_fb326ynNOhI,2506
301
+ unstructured_ingest/v2/interfaces/file_data.py,sha256=5TCMkblUW-Jvy-rS5FqRT22VzDmJqAiQRIWYarpAi64,1543
303
302
  unstructured_ingest/v2/interfaces/indexer.py,sha256=pMw0abNHk_tEuA4BkXX1BdAfIwHdytxj7s6tGxMvYRE,821
304
- unstructured_ingest/v2/interfaces/process.py,sha256=_l4dyaM0u0XxTqQw1Ghr8k2QMpQJMFapLOLhWqSdTdo,512
303
+ unstructured_ingest/v2/interfaces/process.py,sha256=0ecz7mAjlY_DUi9-HhPc9zXphmGclispYwv37O8gvJ0,466
305
304
  unstructured_ingest/v2/interfaces/processor.py,sha256=uHVHeKo5Gt_zFkaEXw7xgaCBDTEl2-Amh-ByA07258o,1620
306
305
  unstructured_ingest/v2/interfaces/upload_stager.py,sha256=SylhDl9pK6qa7hvfrhpabCkjwE03yIlI6oM-mQnqtho,1220
307
306
  unstructured_ingest/v2/interfaces/uploader.py,sha256=bzfx3Ei4poXKu-hsgjAB4sj4jKij9CoaRSadUM5LtGk,1083
308
307
  unstructured_ingest/v2/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
309
- unstructured_ingest/v2/pipeline/interfaces.py,sha256=Z50-6XFZNajfmJbLKunLxw3RuYMzCYiUp6F0jhQwERE,6441
310
- unstructured_ingest/v2/pipeline/pipeline.py,sha256=dqn4_O4il6gZ33mE0DVC1wQKRcXMrD_jll999NoyQgw,14283
308
+ unstructured_ingest/v2/pipeline/interfaces.py,sha256=Zz76fLHNKw6BDsBSYQXiRa6CvyW91ulvZU0yw5vVQ5M,6544
309
+ unstructured_ingest/v2/pipeline/pipeline.py,sha256=r8jRMZI2RF8GQIuTcjIFBDeFtMnqpOJmKhEriy6Vo5Y,11616
311
310
  unstructured_ingest/v2/pipeline/utils.py,sha256=oPAitfdnITqh2O8Z0uf6VOHg9BTJhitRzNmKXqTwPxg,422
312
311
  unstructured_ingest/v2/pipeline/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
313
312
  unstructured_ingest/v2/pipeline/steps/chunk.py,sha256=lfCsBo6A9u1cT57YaEjvNI79gc29nW8c-2_WZNjiO5Y,3275
314
- unstructured_ingest/v2/pipeline/steps/download.py,sha256=qYeuRU5jeICyuTN7E4YUdnbi6X1X2qKoooJMm4Orbdw,7499
313
+ unstructured_ingest/v2/pipeline/steps/download.py,sha256=GA5-zTH4c7Ac8oBoc4AeDA8sQ0dYT-KUOZ4n31K3Jpg,4882
315
314
  unstructured_ingest/v2/pipeline/steps/embed.py,sha256=VCdDBUXK6Yx8RTvRBpEFdFE7n0izvkP73w6s8Tv2sgg,3253
316
- unstructured_ingest/v2/pipeline/steps/filter.py,sha256=mYVccl_zp0CGYFWBrSrPelvSElrXhZahebuymGuirV8,1341
317
315
  unstructured_ingest/v2/pipeline/steps/index.py,sha256=i4RcJ1oRqNp-rFdc6rvKVGcSzNhdB7woW7_W364uThQ,2269
318
316
  unstructured_ingest/v2/pipeline/steps/partition.py,sha256=q7-rpCj5Vy4BXtd7T72gxGb3xg6lmVyNmTwUfHil7Rg,3199
319
317
  unstructured_ingest/v2/pipeline/steps/stage.py,sha256=A8i6VAFY4_xFJR0uBEyBNJlQXmTMGaflXsa6Wa6U1wQ,2274
@@ -323,39 +321,38 @@ unstructured_ingest/v2/processes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
323
321
  unstructured_ingest/v2/processes/chunker.py,sha256=U6zQhaUG_dii66zqL9iEEGodHENNxnpn6V3pC-e7MMI,4233
324
322
  unstructured_ingest/v2/processes/connector_registry.py,sha256=KOrvJNNRdpBPyqFwmTm42kD1xXuo7fNS_5yXjtqAz-c,2100
325
323
  unstructured_ingest/v2/processes/embedder.py,sha256=QjAsiXAjWtZzh6lJ4D5LsTMBD81zuMBkegXNWq-FZt0,3308
326
- unstructured_ingest/v2/processes/filter.py,sha256=CfQihLV_r4yTJgAc66mmbP4_xo3wcDlro5oR_KR--bM,1986
327
324
  unstructured_ingest/v2/processes/partitioner.py,sha256=f6UQoQHVKjl8rmM5J9EcuP30RTFLSLrArGdC6qh-ffE,7645
328
325
  unstructured_ingest/v2/processes/uncompress.py,sha256=x-JZYNs1zJOtRS7xNgiMyrYoAbzKM0p18O8NAl7avCA,1631
329
326
  unstructured_ingest/v2/processes/connectors/__init__.py,sha256=7QMKd8gtEJTIuK352Ho6XyoFvLLhrWIzgdu0dXwXWOE,3960
330
- unstructured_ingest/v2/processes/connectors/astra.py,sha256=m6A34wYjnctRfIF-14bnbGIFBwht5Y8UWZ4g8R9x6a8,5241
331
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py,sha256=N_--5t_hxFzFEK4vERzm46gfg-Ghozb71_NmUlEYIMA,8277
332
- unstructured_ingest/v2/processes/connectors/chroma.py,sha256=W995GLn7D85GoUhSqHQXP5QQ8OglgykA5rcNmg9Ruf4,7158
333
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py,sha256=8bVht450bxp0K4ub1XdCDmMKEooXVV4DNY5b5GWF0Ig,3636
334
- unstructured_ingest/v2/processes/connectors/elasticsearch.py,sha256=myY2FRXtlBYhH-kbTSsn7j9UDzh36NYHqFRP-ys8am4,15358
335
- unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=-iYpwt4xxaICRlHD5Bpap7Ck5HRJcapa6uHl60E1uZ4,12702
336
- unstructured_ingest/v2/processes/connectors/local.py,sha256=IJ5DjASp-5lPmb6J7Y8NROYjIS3sfdRhlcDAZEEGVAw,6573
327
+ unstructured_ingest/v2/processes/connectors/astra.py,sha256=TSI_3GHnEh3gYAC30RTG4b2eEB07agroEFmJ38GnQY4,4903
328
+ unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py,sha256=PT02ZKiJuHMrmBClxqBsyDS0aXUQYLVg02Ns2qh1hD4,7935
329
+ unstructured_ingest/v2/processes/connectors/chroma.py,sha256=nYzNz-8oq-DN0Z4r7lHQFmlved76IaYeRvm7-EmbGUE,6998
330
+ unstructured_ingest/v2/processes/connectors/databricks_volumes.py,sha256=MTLK7SvQqWU-PXmEbGajM4f-CqGWlmlfeED6a5StEWw,3226
331
+ unstructured_ingest/v2/processes/connectors/elasticsearch.py,sha256=6QBvVzPk3mWj9ZqJZN7NvhcJaOO6nSLqLwU6zggP59A,14864
332
+ unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=IkLVafUu280OOoqYmdfdfMB6zlpiWjs2Z5J31ZzJOj4,12681
333
+ unstructured_ingest/v2/processes/connectors/local.py,sha256=maAXVKpRRXj_jseC6EPLTosMgw6ll-0lnGsDdAFLWAE,6646
337
334
  unstructured_ingest/v2/processes/connectors/milvus.py,sha256=FWH4FH-zns7gh8sITg9pLYE9uKm_3GeOXJ4wjY6PMno,6776
338
- unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=XZCgkF28HCR4DtMmr8jlxb59txXgEvfCabovROUrv6Y,4602
339
- unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=_TFO-vlyCxIxMk6hv20CEsicrlh87wCrbi4I1chsMUw,8822
335
+ unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=ErZWAD-su3OCRGv1h84X1PpAWleUPVZcFDEIYjtyP4E,4310
336
+ unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=WDDoFEfd8M_QBTpkGNI2zZGZZ_CR1rQiCsBWYOO2JoA,8311
340
337
  unstructured_ingest/v2/processes/connectors/opensearch.py,sha256=HNRZVQsWnjLLm0yAGiIyHRbhAsBnGSXBO_VkUfIdwdE,5463
341
- unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=0rpOh_pi4GormyIQsnEJbKVb7FeizAbLcbljpnjtpeY,5908
342
- unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=S0dEjT1UxReCC6qE9DlSQBgcSzQbOaIq7SMJqXUpNWQ,10858
343
- unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=NRn0lbOuXqIYqZT15IVFeFQCxpCKzZC_M8pVYZeeNfo,17933
338
+ unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=PtAodxemYgiBZESx-g9a8fcL6XagJd9DIDQjrhE8aPk,5746
339
+ unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=Cz4qEtnbsD9-m1DXANxnVRZTHX2ZaUUBPVFPu5wnFRk,10832
340
+ unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=SNovgGUE5tHdfX_lF5zwM_QRZK7mahHzLZKhnqfk6Tc,17696
344
341
  unstructured_ingest/v2/processes/connectors/singlestore.py,sha256=upF2O4hJ2uiBhDRrpQ8CSJUvzmqu2j5H1b_QbReHJpw,5168
345
- unstructured_ingest/v2/processes/connectors/sql.py,sha256=mbhBI2tcX4q1YJwR3Nr7HGbr-rb8ppUYq2JcN88We3U,9076
342
+ unstructured_ingest/v2/processes/connectors/sql.py,sha256=T0rpCbhEipWlezoJOMiUewcZuk6Had6TkmsDT-PeOL0,8360
346
343
  unstructured_ingest/v2/processes/connectors/utils.py,sha256=nmpZZCeX0O7rGrwHSWM_heBgpZK9tKT6EV1Moer-z40,576
347
- unstructured_ingest/v2/processes/connectors/weaviate.py,sha256=HtJuOUhBs_HA7uOXlEIuYtx0elb0ecsCvP8N822tOMQ,8564
344
+ unstructured_ingest/v2/processes/connectors/weaviate.py,sha256=7H3s44zVKbN3_eR35sbKTKSDOt6ZIIQkX-4t65LuJ6c,8254
348
345
  unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Ypl_n6sl7I1JqX6bGSG0t_FqvCqE3Cy24og,1846
349
346
  unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=RN7zoifocIWVgoP9aMDMz4TP-Z9KhE-HbCCBq33fY90,4674
350
347
  unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=UnD-F9g7yOOBStrAqeKq6GuQjEyHdwOA3jYLj8YZIRM,4088
351
348
  unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=I6mPG9EIso9TcIczCw5Y14Yqd-EhTQ2CLw1MJx1V3dY,4420
352
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=zKrwKTVGnhnitD8h_Url5HRFsJZjM66o3jWrzAm-_UA,12153
349
+ unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=MgOUhDGtTAUuzmAsLvBwV_3ggyL5DDpMm-sb4KNck88,12689
353
350
  unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=RYZq_8hKF7bRxuB5Gozv5AzB3_nTuuooE4UfRjXwEFU,4443
354
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=PXK9a5O3woDuBWSf4R5XLQI5mzHtap8wAKpHI8Rh5gQ,5462
351
+ unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=7lOm5hjb0LBkbe-OWXnV3wDC-3mM_GWwwmdKW0xzh8c,5333
355
352
  unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=J7Ej-j7dtXAluHunwynUfHlNsYwymb-LsrGUFcljcsA,5700
356
353
  unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
357
- unstructured_ingest-0.0.2.dist-info/METADATA,sha256=a68Sz8-m1-ZRFz0p4yic64BhgwTuMdIMmCuPECdhWwA,21568
358
- unstructured_ingest-0.0.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
359
- unstructured_ingest-0.0.2.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
360
- unstructured_ingest-0.0.2.dist-info/top_level.txt,sha256=QaTxTcjfM5Hr9sZJ6weOJvSe5ESQc0F8AWkhHInTCf8,20
361
- unstructured_ingest-0.0.2.dist-info/RECORD,,
354
+ unstructured_ingest-0.0.2.dev0.dist-info/METADATA,sha256=dnWewzRLiYQlU6Fglws3oQnUkgzAVCnKM7BPMtls9YU,21573
355
+ unstructured_ingest-0.0.2.dev0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
356
+ unstructured_ingest-0.0.2.dev0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
357
+ unstructured_ingest-0.0.2.dev0.dist-info/top_level.txt,sha256=QaTxTcjfM5Hr9sZJ6weOJvSe5ESQc0F8AWkhHInTCf8,20
358
+ unstructured_ingest-0.0.2.dev0.dist-info/RECORD,,
@@ -1,28 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.interfaces import CliConfig
6
- from unstructured_ingest.v2.cli.utils import DelimitedString
7
-
8
-
9
- @dataclass
10
- class FilterCliConfig(CliConfig):
11
- @staticmethod
12
- def get_cli_options() -> list[click.Option]:
13
- options = [
14
- click.Option(
15
- ["--file-glob"],
16
- default=None,
17
- type=DelimitedString(),
18
- help="A comma-separated list of file globs to limit which types of "
19
- "local files are accepted, e.g. '*.html,*.txt'",
20
- ),
21
- click.Option(
22
- ["--max-file-size"],
23
- default=None,
24
- type=click.IntRange(min=1),
25
- help="Max file size to process in bytes",
26
- ),
27
- ]
28
- return options
@@ -1,40 +0,0 @@
1
- import asyncio
2
- from dataclasses import dataclass
3
- from typing import Callable, Optional
4
-
5
- from unstructured_ingest.v2.interfaces.file_data import FileData
6
- from unstructured_ingest.v2.logger import logger
7
- from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
8
- from unstructured_ingest.v2.pipeline.utils import sterilize_dict
9
- from unstructured_ingest.v2.processes.filter import Filterer
10
-
11
- STEP_ID = "filter"
12
-
13
-
14
- @dataclass
15
- class FilterStep(PipelineStep):
16
- process: Filterer
17
- identifier: str = STEP_ID
18
-
19
- def __post_init__(self):
20
- config = (
21
- sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
22
- if self.process.config
23
- else None
24
- )
25
- logger.info(f"Created {self.identifier} with configs: {config}")
26
-
27
- async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
28
- file_data = FileData.from_file(path=file_data_path)
29
- fn_kwargs = {"file_data": file_data}
30
- if not asyncio.iscoroutinefunction(fn):
31
- resp = fn(**fn_kwargs)
32
- elif semaphore := self.context.semaphore:
33
- async with semaphore:
34
- resp = await fn(**fn_kwargs)
35
- else:
36
- resp = await fn(**fn_kwargs)
37
-
38
- if resp:
39
- return {"file_data_path": file_data_path}
40
- return None
@@ -1,54 +0,0 @@
1
- import fnmatch
2
- from abc import ABC
3
- from dataclasses import dataclass, field
4
- from typing import Any, Callable, Optional
5
-
6
- from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
7
- from unstructured_ingest.v2.interfaces import FileData
8
- from unstructured_ingest.v2.interfaces.process import BaseProcess
9
- from unstructured_ingest.v2.logger import logger
10
-
11
-
12
- @dataclass
13
- class FiltererConfig(EnhancedDataClassJsonMixin):
14
- file_glob: Optional[list[str]] = None
15
- max_file_size: Optional[int] = None
16
-
17
-
18
- @dataclass
19
- class Filterer(BaseProcess, ABC):
20
- config: FiltererConfig = field(default_factory=lambda: FiltererConfig())
21
- filters: list[Callable[[FileData], bool]] = field(init=False, default_factory=list)
22
-
23
- def __post_init__(self):
24
- # Populate the filters based on values in config
25
- if self.config.file_glob is not None:
26
- self.filters.append(self.glob_filter)
27
- if self.config.max_file_size:
28
- self.filters.append(self.file_size_filter)
29
-
30
- def is_async(self) -> bool:
31
- return False
32
-
33
- def file_size_filter(self, file_data: FileData) -> bool:
34
- if filesize_bytes := file_data.metadata.filesize_bytes:
35
- return filesize_bytes <= self.config.max_file_size
36
- return True
37
-
38
- def glob_filter(self, file_data: FileData) -> bool:
39
- patterns = self.config.file_glob
40
- path = file_data.source_identifiers.fullpath
41
- for pattern in patterns:
42
- if fnmatch.filter([path], pattern):
43
- return True
44
- logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
45
- return False
46
-
47
- def run(self, file_data: FileData, **kwargs: Any) -> Optional[FileData]:
48
- for filter in self.filters:
49
- if not filter(file_data):
50
- logger.debug(
51
- f"filtered out file data due to {filter.__name__}: {file_data.identifier}"
52
- )
53
- return None
54
- return file_data