unstructured-ingest 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. (The source page linked to more details here; the link target is not preserved in this text.)

Files changed (37)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/v2/cli/base/cmd.py +10 -0
  3. unstructured_ingest/v2/cli/base/src.py +2 -0
  4. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +1 -9
  5. unstructured_ingest/v2/cli/cmds/local.py +0 -8
  6. unstructured_ingest/v2/cli/configs/__init__.py +8 -1
  7. unstructured_ingest/v2/cli/configs/filter.py +28 -0
  8. unstructured_ingest/v2/interfaces/__init__.py +2 -1
  9. unstructured_ingest/v2/interfaces/downloader.py +9 -3
  10. unstructured_ingest/v2/interfaces/file_data.py +6 -1
  11. unstructured_ingest/v2/interfaces/process.py +3 -0
  12. unstructured_ingest/v2/pipeline/interfaces.py +3 -5
  13. unstructured_ingest/v2/pipeline/pipeline.py +72 -2
  14. unstructured_ingest/v2/pipeline/steps/download.py +77 -13
  15. unstructured_ingest/v2/pipeline/steps/filter.py +40 -0
  16. unstructured_ingest/v2/processes/connectors/astra.py +8 -0
  17. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +8 -0
  18. unstructured_ingest/v2/processes/connectors/chroma.py +8 -6
  19. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +9 -0
  20. unstructured_ingest/v2/processes/connectors/elasticsearch.py +23 -9
  21. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +22 -31
  22. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -5
  23. unstructured_ingest/v2/processes/connectors/google_drive.py +13 -9
  24. unstructured_ingest/v2/processes/connectors/local.py +15 -15
  25. unstructured_ingest/v2/processes/connectors/mongodb.py +10 -4
  26. unstructured_ingest/v2/processes/connectors/onedrive.py +14 -2
  27. unstructured_ingest/v2/processes/connectors/pinecone.py +6 -3
  28. unstructured_ingest/v2/processes/connectors/salesforce.py +10 -8
  29. unstructured_ingest/v2/processes/connectors/sharepoint.py +14 -8
  30. unstructured_ingest/v2/processes/connectors/sql.py +24 -9
  31. unstructured_ingest/v2/processes/connectors/weaviate.py +13 -5
  32. unstructured_ingest/v2/processes/filter.py +54 -0
  33. {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/METADATA +13 -13
  34. {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/RECORD +37 -34
  35. {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/WHEEL +0 -0
  36. {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/entry_points.txt +0 -0
  37. {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/top_level.txt +0 -0
@@ -4,13 +4,14 @@ import uuid
4
4
  from dataclasses import dataclass, field
5
5
  from datetime import date, datetime
6
6
  from pathlib import Path
7
- from typing import Any, Optional, Union
7
+ from typing import TYPE_CHECKING, Any, Callable, Optional, Union
8
8
 
9
9
  import numpy as np
10
10
  import pandas as pd
11
11
  from dateutil import parser
12
12
 
13
13
  from unstructured_ingest.enhanced_dataclass import enhanced_field
14
+ from unstructured_ingest.error import DestinationConnectionError
14
15
  from unstructured_ingest.utils.dep_check import requires_dependencies
15
16
  from unstructured_ingest.v2.interfaces import (
16
17
  AccessConfig,
@@ -25,6 +26,11 @@ from unstructured_ingest.v2.interfaces import (
25
26
  from unstructured_ingest.v2.logger import logger
26
27
  from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
27
28
 
29
+ if TYPE_CHECKING:
30
+ from sqlite3 import Connection as SqliteConnection
31
+
32
+ from psycopg2.extensions import connection as PostgresConnection
33
+
28
34
  CONNECTOR_TYPE = "sql"
29
35
  ELEMENTS_TABLE_NAME = "elements"
30
36
 
@@ -41,7 +47,7 @@ class DatabaseType(str, enum.Enum):
41
47
 
42
48
 
43
49
  @dataclass
44
- class SimpleSqlConfig(ConnectionConfig):
50
+ class SQLConnectionConfig(ConnectionConfig):
45
51
  db_type: DatabaseType = (
46
52
  # required default value here because of parent class
47
53
  DatabaseType.SQLITE
@@ -134,7 +140,7 @@ class SQLUploadStager(UploadStager):
134
140
  **kwargs: Any,
135
141
  ) -> Path:
136
142
  with open(elements_filepath) as elements_file:
137
- elements_contents = json.load(elements_file)
143
+ elements_contents: list[dict] = json.load(elements_file)
138
144
  output_path = Path(output_dir) / Path(f"{output_filename}.json")
139
145
  output_path.parent.mkdir(parents=True, exist_ok=True)
140
146
 
@@ -151,7 +157,7 @@ class SQLUploadStager(UploadStager):
151
157
  data["id"] = str(uuid.uuid4())
152
158
 
153
159
  # remove extraneous, not supported columns
154
- [data.pop(column) for column in data if column not in _COLUMNS]
160
+ data = {k: v for k, v in data.items() if k in _COLUMNS}
155
161
 
156
162
  output.append(data)
157
163
 
@@ -185,23 +191,32 @@ class SQLUploaderConfig(UploaderConfig):
185
191
  class SQLUploader(Uploader):
186
192
  connector_type: str = CONNECTOR_TYPE
187
193
  upload_config: SQLUploaderConfig
188
- connection_config: SimpleSqlConfig
194
+ connection_config: SQLConnectionConfig
195
+
196
+ def precheck(self) -> None:
197
+ try:
198
+ cursor = self.connection().cursor()
199
+ cursor.execute("SELECT 1;")
200
+ cursor.close()
201
+ except Exception as e:
202
+ logger.error(f"failed to validate connection: {e}", exc_info=True)
203
+ raise DestinationConnectionError(f"failed to validate connection: {e}")
189
204
 
190
205
  @property
191
- def connection(self):
206
+ def connection(self) -> Callable[[], Union["SqliteConnection", "PostgresConnection"]]:
192
207
  if self.connection_config.db_type == DatabaseType.POSTGRESQL:
193
208
  return self._make_psycopg_connection
194
209
  elif self.connection_config.db_type == DatabaseType.SQLITE:
195
210
  return self._make_sqlite_connection
196
211
  raise ValueError(f"Unsupported database {self.connection_config.db_type} connection.")
197
212
 
198
- def _make_sqlite_connection(self):
213
+ def _make_sqlite_connection(self) -> "SqliteConnection":
199
214
  from sqlite3 import connect
200
215
 
201
216
  return connect(database=self.connection_config.database)
202
217
 
203
218
  @requires_dependencies(["psycopg2"], extras="postgres")
204
- def _make_psycopg_connection(self):
219
+ def _make_psycopg_connection(self) -> "PostgresConnection":
205
220
  from psycopg2 import connect
206
221
 
207
222
  return connect(
@@ -261,7 +276,7 @@ class SQLUploader(Uploader):
261
276
 
262
277
 
263
278
  sql_destination_entry = DestinationRegistryEntry(
264
- connection_config=SimpleSqlConfig,
279
+ connection_config=SQLConnectionConfig,
265
280
  uploader=SQLUploader,
266
281
  uploader_config=SQLUploaderConfig,
267
282
  upload_stager=SQLUploadStager,
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Optional
7
7
  from dateutil import parser
8
8
 
9
9
  from unstructured_ingest.enhanced_dataclass import enhanced_field
10
+ from unstructured_ingest.error import DestinationConnectionError
10
11
  from unstructured_ingest.utils.dep_check import requires_dependencies
11
12
  from unstructured_ingest.v2.interfaces import (
12
13
  AccessConfig,
@@ -156,15 +157,21 @@ class WeaviateUploaderConfig(UploaderConfig):
156
157
  class WeaviateUploader(Uploader):
157
158
  upload_config: WeaviateUploaderConfig
158
159
  connection_config: WeaviateConnectionConfig
159
- client: Optional["Client"] = field(init=False)
160
160
  connector_type: str = CONNECTOR_TYPE
161
161
 
162
162
  @requires_dependencies(["weaviate"], extras="weaviate")
163
- def __post_init__(self):
163
+ def get_client(self) -> "Client":
164
164
  from weaviate import Client
165
165
 
166
166
  auth = self._resolve_auth_method()
167
- self.client = Client(url=self.connection_config.host_url, auth_client_secret=auth)
167
+ return Client(url=self.connection_config.host_url, auth_client_secret=auth)
168
+
169
+ def precheck(self) -> None:
170
+ try:
171
+ self.get_client()
172
+ except Exception as e:
173
+ logger.error(f"Failed to validate connection {e}", exc_info=True)
174
+ raise DestinationConnectionError(f"failed to validate connection: {e}")
168
175
 
169
176
  @requires_dependencies(["weaviate"], extras="weaviate")
170
177
  def _resolve_auth_method(self):
@@ -215,8 +222,9 @@ class WeaviateUploader(Uploader):
215
222
  f"at {self.connection_config.host_url}",
216
223
  )
217
224
 
218
- self.client.batch.configure(batch_size=self.upload_config.batch_size)
219
- with self.client.batch as b:
225
+ client = self.get_client()
226
+ client.batch.configure(batch_size=self.upload_config.batch_size)
227
+ with client.batch as b:
220
228
  for e in elements_dict:
221
229
  vector = e.pop("embeddings", None)
222
230
  b.add_data_object(
@@ -0,0 +1,54 @@
1
+ import fnmatch
2
+ from abc import ABC
3
+ from dataclasses import dataclass, field
4
+ from typing import Any, Callable, Optional
5
+
6
+ from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
7
+ from unstructured_ingest.v2.interfaces import FileData
8
+ from unstructured_ingest.v2.interfaces.process import BaseProcess
9
+ from unstructured_ingest.v2.logger import logger
10
+
11
+
12
+ @dataclass
13
+ class FiltererConfig(EnhancedDataClassJsonMixin):
14
+ file_glob: Optional[list[str]] = None
15
+ max_file_size: Optional[int] = None
16
+
17
+
18
+ @dataclass
19
+ class Filterer(BaseProcess, ABC):
20
+ config: FiltererConfig = field(default_factory=lambda: FiltererConfig())
21
+ filters: list[Callable[[FileData], bool]] = field(init=False, default_factory=list)
22
+
23
+ def __post_init__(self):
24
+ # Populate the filters based on values in config
25
+ if self.config.file_glob is not None:
26
+ self.filters.append(self.glob_filter)
27
+ if self.config.max_file_size:
28
+ self.filters.append(self.file_size_filter)
29
+
30
+ def is_async(self) -> bool:
31
+ return False
32
+
33
+ def file_size_filter(self, file_data: FileData) -> bool:
34
+ if filesize_bytes := file_data.metadata.filesize_bytes:
35
+ return filesize_bytes <= self.config.max_file_size
36
+ return True
37
+
38
+ def glob_filter(self, file_data: FileData) -> bool:
39
+ patterns = self.config.file_glob
40
+ path = file_data.source_identifiers.fullpath
41
+ for pattern in patterns:
42
+ if fnmatch.filter([path], pattern):
43
+ return True
44
+ logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
45
+ return False
46
+
47
+ def run(self, file_data: FileData, **kwargs: Any) -> Optional[FileData]:
48
+ for filter in self.filters:
49
+ if not filter(file_data):
50
+ logger.debug(
51
+ f"filtered out file data due to {filter.__name__}: {file_data.identifier}"
52
+ )
53
+ return None
54
+ return file_data
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: unstructured-ingest
3
- Version: 0.0.1
3
+ Version: 0.0.2
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -21,16 +21,16 @@ Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Requires-Python: >=3.9.0,<3.13
23
23
  Description-Content-Type: text/markdown
24
- Requires-Dist: pandas
25
24
  Requires-Dist: unstructured
26
25
  Requires-Dist: python-dateutil
26
+ Requires-Dist: pandas
27
27
  Provides-Extra: airtable
28
28
  Requires-Dist: pyairtable ; extra == 'airtable'
29
29
  Provides-Extra: astra
30
30
  Requires-Dist: astrapy ; extra == 'astra'
31
31
  Provides-Extra: azure
32
- Requires-Dist: adlfs ; extra == 'azure'
33
32
  Requires-Dist: fsspec ; extra == 'azure'
33
+ Requires-Dist: adlfs ; extra == 'azure'
34
34
  Provides-Extra: azure-cognitive-search
35
35
  Requires-Dist: azure-search-documents ; extra == 'azure-cognitive-search'
36
36
  Provides-Extra: bedrock
@@ -39,12 +39,12 @@ Requires-Dist: boto3 ; extra == 'bedrock'
39
39
  Provides-Extra: biomed
40
40
  Requires-Dist: bs4 ; extra == 'biomed'
41
41
  Provides-Extra: box
42
- Requires-Dist: fsspec ; extra == 'box'
43
42
  Requires-Dist: boxfs ; extra == 'box'
43
+ Requires-Dist: fsspec ; extra == 'box'
44
44
  Provides-Extra: chroma
45
+ Requires-Dist: importlib-metadata >=7.1.0 ; extra == 'chroma'
45
46
  Requires-Dist: typer <=0.9.0 ; extra == 'chroma'
46
47
  Requires-Dist: chromadb ; extra == 'chroma'
47
- Requires-Dist: importlib-metadata >=7.1.0 ; extra == 'chroma'
48
48
  Provides-Extra: clarifai
49
49
  Requires-Dist: clarifai ; extra == 'clarifai'
50
50
  Provides-Extra: confluence
@@ -54,8 +54,8 @@ Requires-Dist: unstructured[tsv] ; extra == 'csv'
54
54
  Provides-Extra: databricks-volumes
55
55
  Requires-Dist: databricks-sdk ; extra == 'databricks-volumes'
56
56
  Provides-Extra: delta-table
57
- Requires-Dist: fsspec ; extra == 'delta-table'
58
57
  Requires-Dist: deltalake ; extra == 'delta-table'
58
+ Requires-Dist: fsspec ; extra == 'delta-table'
59
59
  Provides-Extra: discord
60
60
  Requires-Dist: discord-py ; extra == 'discord'
61
61
  Provides-Extra: doc
@@ -69,8 +69,8 @@ Provides-Extra: elasticsearch
69
69
  Requires-Dist: elasticsearch[async] ; extra == 'elasticsearch'
70
70
  Provides-Extra: embed-huggingface
71
71
  Requires-Dist: huggingface ; extra == 'embed-huggingface'
72
- Requires-Dist: langchain-community ; extra == 'embed-huggingface'
73
72
  Requires-Dist: sentence-transformers ; extra == 'embed-huggingface'
73
+ Requires-Dist: langchain-community ; extra == 'embed-huggingface'
74
74
  Provides-Extra: embed-octoai
75
75
  Requires-Dist: tiktoken ; extra == 'embed-octoai'
76
76
  Requires-Dist: openai ; extra == 'embed-octoai'
@@ -79,8 +79,8 @@ Requires-Dist: langchain-community ; extra == 'embed-vertexai'
79
79
  Requires-Dist: langchain ; extra == 'embed-vertexai'
80
80
  Requires-Dist: langchain-google-vertexai ; extra == 'embed-vertexai'
81
81
  Provides-Extra: embed-voyageai
82
- Requires-Dist: langchain-voyageai ; extra == 'embed-voyageai'
83
82
  Requires-Dist: langchain ; extra == 'embed-voyageai'
83
+ Requires-Dist: langchain-voyageai ; extra == 'embed-voyageai'
84
84
  Provides-Extra: epub
85
85
  Requires-Dist: unstructured[epub] ; extra == 'epub'
86
86
  Provides-Extra: gcs
@@ -114,20 +114,20 @@ Requires-Dist: notion-client ; extra == 'notion'
114
114
  Provides-Extra: odt
115
115
  Requires-Dist: unstructured[odt] ; extra == 'odt'
116
116
  Provides-Extra: onedrive
117
+ Requires-Dist: msal ; extra == 'onedrive'
117
118
  Requires-Dist: bs4 ; extra == 'onedrive'
118
119
  Requires-Dist: Office365-REST-Python-Client ; extra == 'onedrive'
119
- Requires-Dist: msal ; extra == 'onedrive'
120
120
  Provides-Extra: openai
121
121
  Requires-Dist: tiktoken ; extra == 'openai'
122
- Requires-Dist: langchain-community ; extra == 'openai'
123
122
  Requires-Dist: openai ; extra == 'openai'
123
+ Requires-Dist: langchain-community ; extra == 'openai'
124
124
  Provides-Extra: opensearch
125
125
  Requires-Dist: opensearch-py ; extra == 'opensearch'
126
126
  Provides-Extra: org
127
127
  Requires-Dist: unstructured[org] ; extra == 'org'
128
128
  Provides-Extra: outlook
129
- Requires-Dist: Office365-REST-Python-Client ; extra == 'outlook'
130
129
  Requires-Dist: msal ; extra == 'outlook'
130
+ Requires-Dist: Office365-REST-Python-Client ; extra == 'outlook'
131
131
  Provides-Extra: pdf
132
132
  Requires-Dist: unstructured[pdf] ; extra == 'pdf'
133
133
  Provides-Extra: pinecone
@@ -147,16 +147,16 @@ Requires-Dist: unstructured[rst] ; extra == 'rst'
147
147
  Provides-Extra: rtf
148
148
  Requires-Dist: unstructured[rtf] ; extra == 'rtf'
149
149
  Provides-Extra: s3
150
- Requires-Dist: fsspec ; extra == 's3'
151
150
  Requires-Dist: s3fs ; extra == 's3'
151
+ Requires-Dist: fsspec ; extra == 's3'
152
152
  Provides-Extra: salesforce
153
153
  Requires-Dist: simple-salesforce ; extra == 'salesforce'
154
154
  Provides-Extra: sftp
155
155
  Requires-Dist: paramiko ; extra == 'sftp'
156
156
  Requires-Dist: fsspec ; extra == 'sftp'
157
157
  Provides-Extra: sharepoint
158
- Requires-Dist: Office365-REST-Python-Client ; extra == 'sharepoint'
159
158
  Requires-Dist: msal ; extra == 'sharepoint'
159
+ Requires-Dist: Office365-REST-Python-Client ; extra == 'sharepoint'
160
160
  Provides-Extra: singlestore
161
161
  Requires-Dist: singlestoredb ; extra == 'singlestore'
162
162
  Provides-Extra: slack
@@ -1,5 +1,5 @@
1
1
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
2
- unstructured_ingest/__version__.py,sha256=SI019rW6paHw93e6fOWFzF9TruLom8o9HrgZsjGZvaE,42
2
+ unstructured_ingest/__version__.py,sha256=t0CFzEk7qlIWbgyEWA53ytTKmHbZ9ow2lAyjeP1bFqw,42
3
3
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
4
4
  unstructured_ingest/evaluate.py,sha256=R-mKLFXbVX1xQ1tjGsLHjdP-TbSSV-925IHzggW_bIg,9793
5
5
  unstructured_ingest/interfaces.py,sha256=uS8L5mS0mXD8I4XTfVlKZxAwqnpJ4yrRqn4vxWVRhQI,31107
@@ -260,10 +260,10 @@ unstructured_ingest/v2/cli/cli.py,sha256=qHXIs-PcvMgDZhP1AR9iDMxh8FXBMJCEDksPBfi
260
260
  unstructured_ingest/v2/cli/interfaces.py,sha256=4Bbow6QHks2a1H56tmVQ4vG3sZy-577ZbwrPmDfizmE,829
261
261
  unstructured_ingest/v2/cli/utils.py,sha256=QK-ee6FzxPf-IbaNXXWlH-GhvqeNnjK2m8ljBD1SusU,9075
262
262
  unstructured_ingest/v2/cli/base/__init__.py,sha256=zXCa7F4FMqItmzxfUIVmyI-CeGh8X85yF8lRxwX_OYQ,83
263
- unstructured_ingest/v2/cli/base/cmd.py,sha256=qVHmquVsVDoYyPByKdUTVCwAFfILMYBw5w6eTTVku-E,9308
263
+ unstructured_ingest/v2/cli/base/cmd.py,sha256=qi9N5rcyyE2nmswFaoKWbs1PonhHsMC5llqND9-rQso,9790
264
264
  unstructured_ingest/v2/cli/base/dest.py,sha256=YMbVIHmYDqvOtxZeEY93stmF2p2ImjuJts7-u-NznYw,2887
265
265
  unstructured_ingest/v2/cli/base/importer.py,sha256=nRt0QQ3qpi264-n_mR0l55C2ddM8nowTNzT1jsWaam8,1128
266
- unstructured_ingest/v2/cli/base/src.py,sha256=7LnZh9FgUX9rerBH6cizVtTWmM6R2sRkxatnGsxYHG0,2410
266
+ unstructured_ingest/v2/cli/base/src.py,sha256=oUPO9GPEbkYm1udfD4YQBYTfaefbhpoIN1HPnD672SQ,2460
267
267
  unstructured_ingest/v2/cli/cmds/__init__.py,sha256=DWPMD6Wqus22sSoIEyTSiOJAm97aNjvdpdrXgsL4uQ0,2647
268
268
  unstructured_ingest/v2/cli/cmds/astra.py,sha256=L-GR2KSP_cFQkQm0aVcdiXmgYMJZCVKIAH794y8qT1M,2590
269
269
  unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py,sha256=VTCSUYeIYKnP60lC7DeBYqoqAJnWuBZrwevCXbeIEzw,2248
@@ -271,7 +271,7 @@ unstructured_ingest/v2/cli/cmds/chroma.py,sha256=RinNOPripk2zRYx1Rt-u-jywXbwh7Js
271
271
  unstructured_ingest/v2/cli/cmds/databricks_volumes.py,sha256=53d9A7UunJLYZFwwwHEraVshFc3gSzUbmKjMOiv7hn4,5920
272
272
  unstructured_ingest/v2/cli/cmds/elasticsearch.py,sha256=joUfnV992fAwEDCtFVJaABwgpyQiWeDl1ZCBEudRtnk,5258
273
273
  unstructured_ingest/v2/cli/cmds/google_drive.py,sha256=mXozabpi8kjRFb0S7kw-xMGtEuFoVUxnvefwL5ZIPHc,2334
274
- unstructured_ingest/v2/cli/cmds/local.py,sha256=lGBFOVDRlrcCtPFjyk0IAYHLRWg95Kunu1Kli7t0ZK4,1899
274
+ unstructured_ingest/v2/cli/cmds/local.py,sha256=UOTYjSdNqCFxhZfN6bdxm8jRp6Ijun2K-WpQq1X83OQ,1544
275
275
  unstructured_ingest/v2/cli/cmds/milvus.py,sha256=PB1ib1rFGGH_-KDi1bSIO3BIiVcqSJEHCBFFrzQrnmI,1998
276
276
  unstructured_ingest/v2/cli/cmds/mongodb.py,sha256=oyV6tacuuxm3dN-AXQgbxvYJiDYo2OOWQKRSBCUGj0E,1823
277
277
  unstructured_ingest/v2/cli/cmds/onedrive.py,sha256=DKqhQyyF-swZxs3C9G5W8ECleq8sWpDbpTuiAHXukXQ,2781
@@ -286,32 +286,34 @@ unstructured_ingest/v2/cli/cmds/fsspec/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
286
286
  unstructured_ingest/v2/cli/cmds/fsspec/azure.py,sha256=ZHfchzSpGkZ99Fq1050JvHP0-aG1pZsBZxxozcFfxwI,2784
287
287
  unstructured_ingest/v2/cli/cmds/fsspec/box.py,sha256=kslkI-0-GyGSJOU7bKgrZeQRXh8HFexDq87ew8kT8kE,1338
288
288
  unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py,sha256=LtcR3rCQPgzJNbV3S90HlL0LPPbW9lYEfE8BG4F-dSI,1349
289
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py,sha256=Mgr_nto1FWxnGmbGdVlIfm-xuBGL0HEi8k3FEmQnZng,2414
289
+ unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py,sha256=BlJLEu6YJUejLLFzlSCVSoZDp2pdjoTsdoFFHVwwkVY,2031
290
290
  unstructured_ingest/v2/cli/cmds/fsspec/gcs.py,sha256=3-0LYnDs0fgNrDqnHpNZKj_6rwNj9wQVaV0lGOhVFPE,2737
291
291
  unstructured_ingest/v2/cli/cmds/fsspec/s3.py,sha256=EXQzYkDtkFli2sfcj4cRDRPFac7b7z1DfQqYlGQcE6o,2279
292
292
  unstructured_ingest/v2/cli/cmds/fsspec/sftp.py,sha256=YY2xKguawMyLdcG0qDYKUgk7DT0KgyZJlV17MfwIhpo,2036
293
- unstructured_ingest/v2/cli/configs/__init__.py,sha256=5NMXm872QQZTvUFZFS06c8c1b6K940K5gxs9lbp8W6M,258
293
+ unstructured_ingest/v2/cli/configs/__init__.py,sha256=nAJ1gT3yCAzoZbEbYswE2SMfSk7TEPxa_1v3qEUsgIQ,336
294
294
  unstructured_ingest/v2/cli/configs/chunk.py,sha256=KvIhmIRIZxazCumMztAKdWs-4MK7qzOb5h6Ned_2bdU,3547
295
295
  unstructured_ingest/v2/cli/configs/embed.py,sha256=q_TwnkxKTKOsMgVYfW6xxbD8FWjU_Uh_X2BQ5-_VLGM,2725
296
+ unstructured_ingest/v2/cli/configs/filter.py,sha256=KYe65_au6m7H4VrjgugC2ain6vsUSWswNSEgcG66VPU,841
296
297
  unstructured_ingest/v2/cli/configs/partition.py,sha256=7wdI18V6c4kaXuf50Lh66n9LbtrYHYd8ffEgDQLqvSk,3931
297
298
  unstructured_ingest/v2/cli/configs/processor.py,sha256=ZHu2DBIuE8VgL3mEt73yYimw2k_PaOEtdxxFqzHfk84,3350
298
- unstructured_ingest/v2/interfaces/__init__.py,sha256=-CHWUlT4rISd-gSfcGKGYFqqSFhMY9lKsT5wxwmOThM,845
299
+ unstructured_ingest/v2/interfaces/__init__.py,sha256=Rfa8crx6De7WNOK-EjsWWwFVpsUfCc6gY8B8tQ3ae9I,899
299
300
  unstructured_ingest/v2/interfaces/connector.py,sha256=u4hE1DpTPDC04-n_IzYyn9w1gNCiPT81anrUoEh30Z8,855
300
- unstructured_ingest/v2/interfaces/downloader.py,sha256=aWlacZZrI6SGw6retnRJtZbqT5voOYq_fb326ynNOhI,2506
301
- unstructured_ingest/v2/interfaces/file_data.py,sha256=5TCMkblUW-Jvy-rS5FqRT22VzDmJqAiQRIWYarpAi64,1543
301
+ unstructured_ingest/v2/interfaces/downloader.py,sha256=zs7cxhzbWVc5L0bV4gdCTexWGMVeXTQ9jJF6PCYSAss,2790
302
+ unstructured_ingest/v2/interfaces/file_data.py,sha256=PZrPJBkNC63lNO_1nwvnAeKRxjM3CsjIY6jSO8T9bVM,1665
302
303
  unstructured_ingest/v2/interfaces/indexer.py,sha256=pMw0abNHk_tEuA4BkXX1BdAfIwHdytxj7s6tGxMvYRE,821
303
- unstructured_ingest/v2/interfaces/process.py,sha256=0ecz7mAjlY_DUi9-HhPc9zXphmGclispYwv37O8gvJ0,466
304
+ unstructured_ingest/v2/interfaces/process.py,sha256=_l4dyaM0u0XxTqQw1Ghr8k2QMpQJMFapLOLhWqSdTdo,512
304
305
  unstructured_ingest/v2/interfaces/processor.py,sha256=uHVHeKo5Gt_zFkaEXw7xgaCBDTEl2-Amh-ByA07258o,1620
305
306
  unstructured_ingest/v2/interfaces/upload_stager.py,sha256=SylhDl9pK6qa7hvfrhpabCkjwE03yIlI6oM-mQnqtho,1220
306
307
  unstructured_ingest/v2/interfaces/uploader.py,sha256=bzfx3Ei4poXKu-hsgjAB4sj4jKij9CoaRSadUM5LtGk,1083
307
308
  unstructured_ingest/v2/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
308
- unstructured_ingest/v2/pipeline/interfaces.py,sha256=Zz76fLHNKw6BDsBSYQXiRa6CvyW91ulvZU0yw5vVQ5M,6544
309
- unstructured_ingest/v2/pipeline/pipeline.py,sha256=r8jRMZI2RF8GQIuTcjIFBDeFtMnqpOJmKhEriy6Vo5Y,11616
309
+ unstructured_ingest/v2/pipeline/interfaces.py,sha256=Z50-6XFZNajfmJbLKunLxw3RuYMzCYiUp6F0jhQwERE,6441
310
+ unstructured_ingest/v2/pipeline/pipeline.py,sha256=dqn4_O4il6gZ33mE0DVC1wQKRcXMrD_jll999NoyQgw,14283
310
311
  unstructured_ingest/v2/pipeline/utils.py,sha256=oPAitfdnITqh2O8Z0uf6VOHg9BTJhitRzNmKXqTwPxg,422
311
312
  unstructured_ingest/v2/pipeline/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
312
313
  unstructured_ingest/v2/pipeline/steps/chunk.py,sha256=lfCsBo6A9u1cT57YaEjvNI79gc29nW8c-2_WZNjiO5Y,3275
313
- unstructured_ingest/v2/pipeline/steps/download.py,sha256=GA5-zTH4c7Ac8oBoc4AeDA8sQ0dYT-KUOZ4n31K3Jpg,4882
314
+ unstructured_ingest/v2/pipeline/steps/download.py,sha256=qYeuRU5jeICyuTN7E4YUdnbi6X1X2qKoooJMm4Orbdw,7499
314
315
  unstructured_ingest/v2/pipeline/steps/embed.py,sha256=VCdDBUXK6Yx8RTvRBpEFdFE7n0izvkP73w6s8Tv2sgg,3253
316
+ unstructured_ingest/v2/pipeline/steps/filter.py,sha256=mYVccl_zp0CGYFWBrSrPelvSElrXhZahebuymGuirV8,1341
315
317
  unstructured_ingest/v2/pipeline/steps/index.py,sha256=i4RcJ1oRqNp-rFdc6rvKVGcSzNhdB7woW7_W364uThQ,2269
316
318
  unstructured_ingest/v2/pipeline/steps/partition.py,sha256=q7-rpCj5Vy4BXtd7T72gxGb3xg6lmVyNmTwUfHil7Rg,3199
317
319
  unstructured_ingest/v2/pipeline/steps/stage.py,sha256=A8i6VAFY4_xFJR0uBEyBNJlQXmTMGaflXsa6Wa6U1wQ,2274
@@ -321,38 +323,39 @@ unstructured_ingest/v2/processes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
321
323
  unstructured_ingest/v2/processes/chunker.py,sha256=U6zQhaUG_dii66zqL9iEEGodHENNxnpn6V3pC-e7MMI,4233
322
324
  unstructured_ingest/v2/processes/connector_registry.py,sha256=KOrvJNNRdpBPyqFwmTm42kD1xXuo7fNS_5yXjtqAz-c,2100
323
325
  unstructured_ingest/v2/processes/embedder.py,sha256=QjAsiXAjWtZzh6lJ4D5LsTMBD81zuMBkegXNWq-FZt0,3308
326
+ unstructured_ingest/v2/processes/filter.py,sha256=CfQihLV_r4yTJgAc66mmbP4_xo3wcDlro5oR_KR--bM,1986
324
327
  unstructured_ingest/v2/processes/partitioner.py,sha256=f6UQoQHVKjl8rmM5J9EcuP30RTFLSLrArGdC6qh-ffE,7645
325
328
  unstructured_ingest/v2/processes/uncompress.py,sha256=x-JZYNs1zJOtRS7xNgiMyrYoAbzKM0p18O8NAl7avCA,1631
326
329
  unstructured_ingest/v2/processes/connectors/__init__.py,sha256=7QMKd8gtEJTIuK352Ho6XyoFvLLhrWIzgdu0dXwXWOE,3960
327
- unstructured_ingest/v2/processes/connectors/astra.py,sha256=TSI_3GHnEh3gYAC30RTG4b2eEB07agroEFmJ38GnQY4,4903
328
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py,sha256=PT02ZKiJuHMrmBClxqBsyDS0aXUQYLVg02Ns2qh1hD4,7935
329
- unstructured_ingest/v2/processes/connectors/chroma.py,sha256=nYzNz-8oq-DN0Z4r7lHQFmlved76IaYeRvm7-EmbGUE,6998
330
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py,sha256=MTLK7SvQqWU-PXmEbGajM4f-CqGWlmlfeED6a5StEWw,3226
331
- unstructured_ingest/v2/processes/connectors/elasticsearch.py,sha256=6QBvVzPk3mWj9ZqJZN7NvhcJaOO6nSLqLwU6zggP59A,14864
332
- unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=IkLVafUu280OOoqYmdfdfMB6zlpiWjs2Z5J31ZzJOj4,12681
333
- unstructured_ingest/v2/processes/connectors/local.py,sha256=maAXVKpRRXj_jseC6EPLTosMgw6ll-0lnGsDdAFLWAE,6646
330
+ unstructured_ingest/v2/processes/connectors/astra.py,sha256=m6A34wYjnctRfIF-14bnbGIFBwht5Y8UWZ4g8R9x6a8,5241
331
+ unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py,sha256=N_--5t_hxFzFEK4vERzm46gfg-Ghozb71_NmUlEYIMA,8277
332
+ unstructured_ingest/v2/processes/connectors/chroma.py,sha256=W995GLn7D85GoUhSqHQXP5QQ8OglgykA5rcNmg9Ruf4,7158
333
+ unstructured_ingest/v2/processes/connectors/databricks_volumes.py,sha256=8bVht450bxp0K4ub1XdCDmMKEooXVV4DNY5b5GWF0Ig,3636
334
+ unstructured_ingest/v2/processes/connectors/elasticsearch.py,sha256=myY2FRXtlBYhH-kbTSsn7j9UDzh36NYHqFRP-ys8am4,15358
335
+ unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=-iYpwt4xxaICRlHD5Bpap7Ck5HRJcapa6uHl60E1uZ4,12702
336
+ unstructured_ingest/v2/processes/connectors/local.py,sha256=IJ5DjASp-5lPmb6J7Y8NROYjIS3sfdRhlcDAZEEGVAw,6573
334
337
  unstructured_ingest/v2/processes/connectors/milvus.py,sha256=FWH4FH-zns7gh8sITg9pLYE9uKm_3GeOXJ4wjY6PMno,6776
335
- unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=ErZWAD-su3OCRGv1h84X1PpAWleUPVZcFDEIYjtyP4E,4310
336
- unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=WDDoFEfd8M_QBTpkGNI2zZGZZ_CR1rQiCsBWYOO2JoA,8311
338
+ unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=XZCgkF28HCR4DtMmr8jlxb59txXgEvfCabovROUrv6Y,4602
339
+ unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=_TFO-vlyCxIxMk6hv20CEsicrlh87wCrbi4I1chsMUw,8822
337
340
  unstructured_ingest/v2/processes/connectors/opensearch.py,sha256=HNRZVQsWnjLLm0yAGiIyHRbhAsBnGSXBO_VkUfIdwdE,5463
338
- unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=PtAodxemYgiBZESx-g9a8fcL6XagJd9DIDQjrhE8aPk,5746
339
- unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=Cz4qEtnbsD9-m1DXANxnVRZTHX2ZaUUBPVFPu5wnFRk,10832
340
- unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=SNovgGUE5tHdfX_lF5zwM_QRZK7mahHzLZKhnqfk6Tc,17696
341
+ unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=0rpOh_pi4GormyIQsnEJbKVb7FeizAbLcbljpnjtpeY,5908
342
+ unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=S0dEjT1UxReCC6qE9DlSQBgcSzQbOaIq7SMJqXUpNWQ,10858
343
+ unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=NRn0lbOuXqIYqZT15IVFeFQCxpCKzZC_M8pVYZeeNfo,17933
341
344
  unstructured_ingest/v2/processes/connectors/singlestore.py,sha256=upF2O4hJ2uiBhDRrpQ8CSJUvzmqu2j5H1b_QbReHJpw,5168
342
- unstructured_ingest/v2/processes/connectors/sql.py,sha256=T0rpCbhEipWlezoJOMiUewcZuk6Had6TkmsDT-PeOL0,8360
345
+ unstructured_ingest/v2/processes/connectors/sql.py,sha256=mbhBI2tcX4q1YJwR3Nr7HGbr-rb8ppUYq2JcN88We3U,9076
343
346
  unstructured_ingest/v2/processes/connectors/utils.py,sha256=nmpZZCeX0O7rGrwHSWM_heBgpZK9tKT6EV1Moer-z40,576
344
- unstructured_ingest/v2/processes/connectors/weaviate.py,sha256=7H3s44zVKbN3_eR35sbKTKSDOt6ZIIQkX-4t65LuJ6c,8254
347
+ unstructured_ingest/v2/processes/connectors/weaviate.py,sha256=HtJuOUhBs_HA7uOXlEIuYtx0elb0ecsCvP8N822tOMQ,8564
345
348
  unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Ypl_n6sl7I1JqX6bGSG0t_FqvCqE3Cy24og,1846
346
349
  unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=RN7zoifocIWVgoP9aMDMz4TP-Z9KhE-HbCCBq33fY90,4674
347
350
  unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=UnD-F9g7yOOBStrAqeKq6GuQjEyHdwOA3jYLj8YZIRM,4088
348
351
  unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=I6mPG9EIso9TcIczCw5Y14Yqd-EhTQ2CLw1MJx1V3dY,4420
349
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=gNgrRqKqk9YpBRGqGPvBUuEcBv1jN59fmBBj6NrB4sA,12394
352
+ unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=zKrwKTVGnhnitD8h_Url5HRFsJZjM66o3jWrzAm-_UA,12153
350
353
  unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=RYZq_8hKF7bRxuB5Gozv5AzB3_nTuuooE4UfRjXwEFU,4443
351
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=7lOm5hjb0LBkbe-OWXnV3wDC-3mM_GWwwmdKW0xzh8c,5333
354
+ unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=PXK9a5O3woDuBWSf4R5XLQI5mzHtap8wAKpHI8Rh5gQ,5462
352
355
  unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=J7Ej-j7dtXAluHunwynUfHlNsYwymb-LsrGUFcljcsA,5700
353
356
  unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
354
- unstructured_ingest-0.0.1.dist-info/METADATA,sha256=Qru27Cxrf0C-vFe7MqfaKOfavazrWYTTRif6loKf71o,21568
355
- unstructured_ingest-0.0.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
356
- unstructured_ingest-0.0.1.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
357
- unstructured_ingest-0.0.1.dist-info/top_level.txt,sha256=QaTxTcjfM5Hr9sZJ6weOJvSe5ESQc0F8AWkhHInTCf8,20
358
- unstructured_ingest-0.0.1.dist-info/RECORD,,
357
+ unstructured_ingest-0.0.2.dist-info/METADATA,sha256=a68Sz8-m1-ZRFz0p4yic64BhgwTuMdIMmCuPECdhWwA,21568
358
+ unstructured_ingest-0.0.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
359
+ unstructured_ingest-0.0.2.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
360
+ unstructured_ingest-0.0.2.dist-info/top_level.txt,sha256=QaTxTcjfM5Hr9sZJ6weOJvSe5ESQc0F8AWkhHInTCf8,20
361
+ unstructured_ingest-0.0.2.dist-info/RECORD,,