unstructured-ingest 0.0.2__py3-none-any.whl → 0.0.2.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/cli/base/cmd.py +0 -10
- unstructured_ingest/v2/cli/base/src.py +0 -2
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +9 -1
- unstructured_ingest/v2/cli/cmds/local.py +8 -0
- unstructured_ingest/v2/cli/configs/__init__.py +1 -8
- unstructured_ingest/v2/interfaces/__init__.py +1 -2
- unstructured_ingest/v2/interfaces/downloader.py +3 -9
- unstructured_ingest/v2/interfaces/file_data.py +1 -6
- unstructured_ingest/v2/interfaces/process.py +0 -3
- unstructured_ingest/v2/pipeline/interfaces.py +5 -3
- unstructured_ingest/v2/pipeline/pipeline.py +2 -72
- unstructured_ingest/v2/pipeline/steps/download.py +13 -77
- unstructured_ingest/v2/processes/connectors/astra.py +0 -8
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +0 -8
- unstructured_ingest/v2/processes/connectors/chroma.py +6 -8
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +0 -9
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +9 -23
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +27 -12
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +5 -13
- unstructured_ingest/v2/processes/connectors/google_drive.py +9 -13
- unstructured_ingest/v2/processes/connectors/local.py +15 -15
- unstructured_ingest/v2/processes/connectors/mongodb.py +4 -10
- unstructured_ingest/v2/processes/connectors/onedrive.py +2 -14
- unstructured_ingest/v2/processes/connectors/pinecone.py +3 -6
- unstructured_ingest/v2/processes/connectors/salesforce.py +8 -10
- unstructured_ingest/v2/processes/connectors/sharepoint.py +8 -14
- unstructured_ingest/v2/processes/connectors/sql.py +9 -24
- unstructured_ingest/v2/processes/connectors/weaviate.py +5 -13
- {unstructured_ingest-0.0.2.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/METADATA +15 -15
- {unstructured_ingest-0.0.2.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/RECORD +34 -37
- unstructured_ingest/v2/cli/configs/filter.py +0 -28
- unstructured_ingest/v2/pipeline/steps/filter.py +0 -40
- unstructured_ingest/v2/processes/filter.py +0 -54
- {unstructured_ingest-0.0.2.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.2.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.2.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/top_level.txt +0 -0
|
@@ -4,14 +4,13 @@ import uuid
|
|
|
4
4
|
from dataclasses import dataclass, field
|
|
5
5
|
from datetime import date, datetime
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
from typing import
|
|
7
|
+
from typing import Any, Optional, Union
|
|
8
8
|
|
|
9
9
|
import numpy as np
|
|
10
10
|
import pandas as pd
|
|
11
11
|
from dateutil import parser
|
|
12
12
|
|
|
13
13
|
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
14
|
-
from unstructured_ingest.error import DestinationConnectionError
|
|
15
14
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
16
15
|
from unstructured_ingest.v2.interfaces import (
|
|
17
16
|
AccessConfig,
|
|
@@ -26,11 +25,6 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
26
25
|
from unstructured_ingest.v2.logger import logger
|
|
27
26
|
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
28
27
|
|
|
29
|
-
if TYPE_CHECKING:
|
|
30
|
-
from sqlite3 import Connection as SqliteConnection
|
|
31
|
-
|
|
32
|
-
from psycopg2.extensions import connection as PostgresConnection
|
|
33
|
-
|
|
34
28
|
CONNECTOR_TYPE = "sql"
|
|
35
29
|
ELEMENTS_TABLE_NAME = "elements"
|
|
36
30
|
|
|
@@ -47,7 +41,7 @@ class DatabaseType(str, enum.Enum):
|
|
|
47
41
|
|
|
48
42
|
|
|
49
43
|
@dataclass
|
|
50
|
-
class
|
|
44
|
+
class SimpleSqlConfig(ConnectionConfig):
|
|
51
45
|
db_type: DatabaseType = (
|
|
52
46
|
# required default value here because of parent class
|
|
53
47
|
DatabaseType.SQLITE
|
|
@@ -140,7 +134,7 @@ class SQLUploadStager(UploadStager):
|
|
|
140
134
|
**kwargs: Any,
|
|
141
135
|
) -> Path:
|
|
142
136
|
with open(elements_filepath) as elements_file:
|
|
143
|
-
elements_contents
|
|
137
|
+
elements_contents = json.load(elements_file)
|
|
144
138
|
output_path = Path(output_dir) / Path(f"{output_filename}.json")
|
|
145
139
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
146
140
|
|
|
@@ -157,7 +151,7 @@ class SQLUploadStager(UploadStager):
|
|
|
157
151
|
data["id"] = str(uuid.uuid4())
|
|
158
152
|
|
|
159
153
|
# remove extraneous, not supported columns
|
|
160
|
-
data
|
|
154
|
+
[data.pop(column) for column in data if column not in _COLUMNS]
|
|
161
155
|
|
|
162
156
|
output.append(data)
|
|
163
157
|
|
|
@@ -191,32 +185,23 @@ class SQLUploaderConfig(UploaderConfig):
|
|
|
191
185
|
class SQLUploader(Uploader):
|
|
192
186
|
connector_type: str = CONNECTOR_TYPE
|
|
193
187
|
upload_config: SQLUploaderConfig
|
|
194
|
-
connection_config:
|
|
195
|
-
|
|
196
|
-
def precheck(self) -> None:
|
|
197
|
-
try:
|
|
198
|
-
cursor = self.connection().cursor()
|
|
199
|
-
cursor.execute("SELECT 1;")
|
|
200
|
-
cursor.close()
|
|
201
|
-
except Exception as e:
|
|
202
|
-
logger.error(f"failed to validate connection: {e}", exc_info=True)
|
|
203
|
-
raise DestinationConnectionError(f"failed to validate connection: {e}")
|
|
188
|
+
connection_config: SimpleSqlConfig
|
|
204
189
|
|
|
205
190
|
@property
|
|
206
|
-
def connection(self)
|
|
191
|
+
def connection(self):
|
|
207
192
|
if self.connection_config.db_type == DatabaseType.POSTGRESQL:
|
|
208
193
|
return self._make_psycopg_connection
|
|
209
194
|
elif self.connection_config.db_type == DatabaseType.SQLITE:
|
|
210
195
|
return self._make_sqlite_connection
|
|
211
196
|
raise ValueError(f"Unsupported database {self.connection_config.db_type} connection.")
|
|
212
197
|
|
|
213
|
-
def _make_sqlite_connection(self)
|
|
198
|
+
def _make_sqlite_connection(self):
|
|
214
199
|
from sqlite3 import connect
|
|
215
200
|
|
|
216
201
|
return connect(database=self.connection_config.database)
|
|
217
202
|
|
|
218
203
|
@requires_dependencies(["psycopg2"], extras="postgres")
|
|
219
|
-
def _make_psycopg_connection(self)
|
|
204
|
+
def _make_psycopg_connection(self):
|
|
220
205
|
from psycopg2 import connect
|
|
221
206
|
|
|
222
207
|
return connect(
|
|
@@ -276,7 +261,7 @@ class SQLUploader(Uploader):
|
|
|
276
261
|
|
|
277
262
|
|
|
278
263
|
sql_destination_entry = DestinationRegistryEntry(
|
|
279
|
-
connection_config=
|
|
264
|
+
connection_config=SimpleSqlConfig,
|
|
280
265
|
uploader=SQLUploader,
|
|
281
266
|
uploader_config=SQLUploaderConfig,
|
|
282
267
|
upload_stager=SQLUploadStager,
|
|
@@ -7,7 +7,6 @@ from typing import TYPE_CHECKING, Any, Optional
|
|
|
7
7
|
from dateutil import parser
|
|
8
8
|
|
|
9
9
|
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
10
|
-
from unstructured_ingest.error import DestinationConnectionError
|
|
11
10
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
12
11
|
from unstructured_ingest.v2.interfaces import (
|
|
13
12
|
AccessConfig,
|
|
@@ -157,21 +156,15 @@ class WeaviateUploaderConfig(UploaderConfig):
|
|
|
157
156
|
class WeaviateUploader(Uploader):
|
|
158
157
|
upload_config: WeaviateUploaderConfig
|
|
159
158
|
connection_config: WeaviateConnectionConfig
|
|
159
|
+
client: Optional["Client"] = field(init=False)
|
|
160
160
|
connector_type: str = CONNECTOR_TYPE
|
|
161
161
|
|
|
162
162
|
@requires_dependencies(["weaviate"], extras="weaviate")
|
|
163
|
-
def
|
|
163
|
+
def __post_init__(self):
|
|
164
164
|
from weaviate import Client
|
|
165
165
|
|
|
166
166
|
auth = self._resolve_auth_method()
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
def precheck(self) -> None:
|
|
170
|
-
try:
|
|
171
|
-
self.get_client()
|
|
172
|
-
except Exception as e:
|
|
173
|
-
logger.error(f"Failed to validate connection {e}", exc_info=True)
|
|
174
|
-
raise DestinationConnectionError(f"failed to validate connection: {e}")
|
|
167
|
+
self.client = Client(url=self.connection_config.host_url, auth_client_secret=auth)
|
|
175
168
|
|
|
176
169
|
@requires_dependencies(["weaviate"], extras="weaviate")
|
|
177
170
|
def _resolve_auth_method(self):
|
|
@@ -222,9 +215,8 @@ class WeaviateUploader(Uploader):
|
|
|
222
215
|
f"at {self.connection_config.host_url}",
|
|
223
216
|
)
|
|
224
217
|
|
|
225
|
-
client
|
|
226
|
-
client.batch
|
|
227
|
-
with client.batch as b:
|
|
218
|
+
self.client.batch.configure(batch_size=self.upload_config.batch_size)
|
|
219
|
+
with self.client.batch as b:
|
|
228
220
|
for e in elements_dict:
|
|
229
221
|
vector = e.pop("embeddings", None)
|
|
230
222
|
b.add_data_object(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: unstructured-ingest
|
|
3
|
-
Version: 0.0.2
|
|
3
|
+
Version: 0.0.2.dev0
|
|
4
4
|
Summary: A library that prepares raw documents for downstream ML tasks.
|
|
5
5
|
Home-page: https://github.com/Unstructured-IO/unstructured-ingest
|
|
6
6
|
Author: Unstructured Technologies
|
|
@@ -29,8 +29,8 @@ Requires-Dist: pyairtable ; extra == 'airtable'
|
|
|
29
29
|
Provides-Extra: astra
|
|
30
30
|
Requires-Dist: astrapy ; extra == 'astra'
|
|
31
31
|
Provides-Extra: azure
|
|
32
|
-
Requires-Dist: fsspec ; extra == 'azure'
|
|
33
32
|
Requires-Dist: adlfs ; extra == 'azure'
|
|
33
|
+
Requires-Dist: fsspec ; extra == 'azure'
|
|
34
34
|
Provides-Extra: azure-cognitive-search
|
|
35
35
|
Requires-Dist: azure-search-documents ; extra == 'azure-cognitive-search'
|
|
36
36
|
Provides-Extra: bedrock
|
|
@@ -42,9 +42,9 @@ Provides-Extra: box
|
|
|
42
42
|
Requires-Dist: boxfs ; extra == 'box'
|
|
43
43
|
Requires-Dist: fsspec ; extra == 'box'
|
|
44
44
|
Provides-Extra: chroma
|
|
45
|
-
Requires-Dist: importlib-metadata >=7.1.0 ; extra == 'chroma'
|
|
46
45
|
Requires-Dist: typer <=0.9.0 ; extra == 'chroma'
|
|
47
46
|
Requires-Dist: chromadb ; extra == 'chroma'
|
|
47
|
+
Requires-Dist: importlib-metadata >=7.1.0 ; extra == 'chroma'
|
|
48
48
|
Provides-Extra: clarifai
|
|
49
49
|
Requires-Dist: clarifai ; extra == 'clarifai'
|
|
50
50
|
Provides-Extra: confluence
|
|
@@ -54,8 +54,8 @@ Requires-Dist: unstructured[tsv] ; extra == 'csv'
|
|
|
54
54
|
Provides-Extra: databricks-volumes
|
|
55
55
|
Requires-Dist: databricks-sdk ; extra == 'databricks-volumes'
|
|
56
56
|
Provides-Extra: delta-table
|
|
57
|
-
Requires-Dist: deltalake ; extra == 'delta-table'
|
|
58
57
|
Requires-Dist: fsspec ; extra == 'delta-table'
|
|
58
|
+
Requires-Dist: deltalake ; extra == 'delta-table'
|
|
59
59
|
Provides-Extra: discord
|
|
60
60
|
Requires-Dist: discord-py ; extra == 'discord'
|
|
61
61
|
Provides-Extra: doc
|
|
@@ -63,21 +63,21 @@ Requires-Dist: unstructured[docx] ; extra == 'doc'
|
|
|
63
63
|
Provides-Extra: docx
|
|
64
64
|
Requires-Dist: unstructured[docx] ; extra == 'docx'
|
|
65
65
|
Provides-Extra: dropbox
|
|
66
|
-
Requires-Dist: dropboxdrivefs ; extra == 'dropbox'
|
|
67
66
|
Requires-Dist: fsspec ; extra == 'dropbox'
|
|
67
|
+
Requires-Dist: dropboxdrivefs ; extra == 'dropbox'
|
|
68
68
|
Provides-Extra: elasticsearch
|
|
69
69
|
Requires-Dist: elasticsearch[async] ; extra == 'elasticsearch'
|
|
70
70
|
Provides-Extra: embed-huggingface
|
|
71
|
+
Requires-Dist: langchain-community ; extra == 'embed-huggingface'
|
|
71
72
|
Requires-Dist: huggingface ; extra == 'embed-huggingface'
|
|
72
73
|
Requires-Dist: sentence-transformers ; extra == 'embed-huggingface'
|
|
73
|
-
Requires-Dist: langchain-community ; extra == 'embed-huggingface'
|
|
74
74
|
Provides-Extra: embed-octoai
|
|
75
75
|
Requires-Dist: tiktoken ; extra == 'embed-octoai'
|
|
76
76
|
Requires-Dist: openai ; extra == 'embed-octoai'
|
|
77
77
|
Provides-Extra: embed-vertexai
|
|
78
|
-
Requires-Dist: langchain-community ; extra == 'embed-vertexai'
|
|
79
78
|
Requires-Dist: langchain ; extra == 'embed-vertexai'
|
|
80
79
|
Requires-Dist: langchain-google-vertexai ; extra == 'embed-vertexai'
|
|
80
|
+
Requires-Dist: langchain-community ; extra == 'embed-vertexai'
|
|
81
81
|
Provides-Extra: embed-voyageai
|
|
82
82
|
Requires-Dist: langchain ; extra == 'embed-voyageai'
|
|
83
83
|
Requires-Dist: langchain-voyageai ; extra == 'embed-voyageai'
|
|
@@ -85,8 +85,8 @@ Provides-Extra: epub
|
|
|
85
85
|
Requires-Dist: unstructured[epub] ; extra == 'epub'
|
|
86
86
|
Provides-Extra: gcs
|
|
87
87
|
Requires-Dist: bs4 ; extra == 'gcs'
|
|
88
|
-
Requires-Dist: gcsfs ; extra == 'gcs'
|
|
89
88
|
Requires-Dist: fsspec ; extra == 'gcs'
|
|
89
|
+
Requires-Dist: gcsfs ; extra == 'gcs'
|
|
90
90
|
Provides-Extra: github
|
|
91
91
|
Requires-Dist: pygithub >1.58.0 ; extra == 'github'
|
|
92
92
|
Provides-Extra: gitlab
|
|
@@ -109,25 +109,25 @@ Requires-Dist: pymongo ; extra == 'mongodb'
|
|
|
109
109
|
Provides-Extra: msg
|
|
110
110
|
Requires-Dist: unstructured[msg] ; extra == 'msg'
|
|
111
111
|
Provides-Extra: notion
|
|
112
|
-
Requires-Dist: htmlBuilder ; extra == 'notion'
|
|
113
112
|
Requires-Dist: notion-client ; extra == 'notion'
|
|
113
|
+
Requires-Dist: htmlBuilder ; extra == 'notion'
|
|
114
114
|
Provides-Extra: odt
|
|
115
115
|
Requires-Dist: unstructured[odt] ; extra == 'odt'
|
|
116
116
|
Provides-Extra: onedrive
|
|
117
|
+
Requires-Dist: Office365-REST-Python-Client ; extra == 'onedrive'
|
|
117
118
|
Requires-Dist: msal ; extra == 'onedrive'
|
|
118
119
|
Requires-Dist: bs4 ; extra == 'onedrive'
|
|
119
|
-
Requires-Dist: Office365-REST-Python-Client ; extra == 'onedrive'
|
|
120
120
|
Provides-Extra: openai
|
|
121
|
-
Requires-Dist: tiktoken ; extra == 'openai'
|
|
122
|
-
Requires-Dist: openai ; extra == 'openai'
|
|
123
121
|
Requires-Dist: langchain-community ; extra == 'openai'
|
|
122
|
+
Requires-Dist: openai ; extra == 'openai'
|
|
123
|
+
Requires-Dist: tiktoken ; extra == 'openai'
|
|
124
124
|
Provides-Extra: opensearch
|
|
125
125
|
Requires-Dist: opensearch-py ; extra == 'opensearch'
|
|
126
126
|
Provides-Extra: org
|
|
127
127
|
Requires-Dist: unstructured[org] ; extra == 'org'
|
|
128
128
|
Provides-Extra: outlook
|
|
129
|
-
Requires-Dist: msal ; extra == 'outlook'
|
|
130
129
|
Requires-Dist: Office365-REST-Python-Client ; extra == 'outlook'
|
|
130
|
+
Requires-Dist: msal ; extra == 'outlook'
|
|
131
131
|
Provides-Extra: pdf
|
|
132
132
|
Requires-Dist: unstructured[pdf] ; extra == 'pdf'
|
|
133
133
|
Provides-Extra: pinecone
|
|
@@ -152,11 +152,11 @@ Requires-Dist: fsspec ; extra == 's3'
|
|
|
152
152
|
Provides-Extra: salesforce
|
|
153
153
|
Requires-Dist: simple-salesforce ; extra == 'salesforce'
|
|
154
154
|
Provides-Extra: sftp
|
|
155
|
-
Requires-Dist: paramiko ; extra == 'sftp'
|
|
156
155
|
Requires-Dist: fsspec ; extra == 'sftp'
|
|
156
|
+
Requires-Dist: paramiko ; extra == 'sftp'
|
|
157
157
|
Provides-Extra: sharepoint
|
|
158
|
-
Requires-Dist: msal ; extra == 'sharepoint'
|
|
159
158
|
Requires-Dist: Office365-REST-Python-Client ; extra == 'sharepoint'
|
|
159
|
+
Requires-Dist: msal ; extra == 'sharepoint'
|
|
160
160
|
Provides-Extra: singlestore
|
|
161
161
|
Requires-Dist: singlestoredb ; extra == 'singlestore'
|
|
162
162
|
Provides-Extra: slack
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
|
|
2
|
-
unstructured_ingest/__version__.py,sha256=
|
|
2
|
+
unstructured_ingest/__version__.py,sha256=neOBPct_gjgXqs6YN8HnzdaRiPiugEbJpwI6SDZ7qac,47
|
|
3
3
|
unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
|
|
4
4
|
unstructured_ingest/evaluate.py,sha256=R-mKLFXbVX1xQ1tjGsLHjdP-TbSSV-925IHzggW_bIg,9793
|
|
5
5
|
unstructured_ingest/interfaces.py,sha256=uS8L5mS0mXD8I4XTfVlKZxAwqnpJ4yrRqn4vxWVRhQI,31107
|
|
@@ -260,10 +260,10 @@ unstructured_ingest/v2/cli/cli.py,sha256=qHXIs-PcvMgDZhP1AR9iDMxh8FXBMJCEDksPBfi
|
|
|
260
260
|
unstructured_ingest/v2/cli/interfaces.py,sha256=4Bbow6QHks2a1H56tmVQ4vG3sZy-577ZbwrPmDfizmE,829
|
|
261
261
|
unstructured_ingest/v2/cli/utils.py,sha256=QK-ee6FzxPf-IbaNXXWlH-GhvqeNnjK2m8ljBD1SusU,9075
|
|
262
262
|
unstructured_ingest/v2/cli/base/__init__.py,sha256=zXCa7F4FMqItmzxfUIVmyI-CeGh8X85yF8lRxwX_OYQ,83
|
|
263
|
-
unstructured_ingest/v2/cli/base/cmd.py,sha256=
|
|
263
|
+
unstructured_ingest/v2/cli/base/cmd.py,sha256=qVHmquVsVDoYyPByKdUTVCwAFfILMYBw5w6eTTVku-E,9308
|
|
264
264
|
unstructured_ingest/v2/cli/base/dest.py,sha256=YMbVIHmYDqvOtxZeEY93stmF2p2ImjuJts7-u-NznYw,2887
|
|
265
265
|
unstructured_ingest/v2/cli/base/importer.py,sha256=nRt0QQ3qpi264-n_mR0l55C2ddM8nowTNzT1jsWaam8,1128
|
|
266
|
-
unstructured_ingest/v2/cli/base/src.py,sha256=
|
|
266
|
+
unstructured_ingest/v2/cli/base/src.py,sha256=7LnZh9FgUX9rerBH6cizVtTWmM6R2sRkxatnGsxYHG0,2410
|
|
267
267
|
unstructured_ingest/v2/cli/cmds/__init__.py,sha256=DWPMD6Wqus22sSoIEyTSiOJAm97aNjvdpdrXgsL4uQ0,2647
|
|
268
268
|
unstructured_ingest/v2/cli/cmds/astra.py,sha256=L-GR2KSP_cFQkQm0aVcdiXmgYMJZCVKIAH794y8qT1M,2590
|
|
269
269
|
unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py,sha256=VTCSUYeIYKnP60lC7DeBYqoqAJnWuBZrwevCXbeIEzw,2248
|
|
@@ -271,7 +271,7 @@ unstructured_ingest/v2/cli/cmds/chroma.py,sha256=RinNOPripk2zRYx1Rt-u-jywXbwh7Js
|
|
|
271
271
|
unstructured_ingest/v2/cli/cmds/databricks_volumes.py,sha256=53d9A7UunJLYZFwwwHEraVshFc3gSzUbmKjMOiv7hn4,5920
|
|
272
272
|
unstructured_ingest/v2/cli/cmds/elasticsearch.py,sha256=joUfnV992fAwEDCtFVJaABwgpyQiWeDl1ZCBEudRtnk,5258
|
|
273
273
|
unstructured_ingest/v2/cli/cmds/google_drive.py,sha256=mXozabpi8kjRFb0S7kw-xMGtEuFoVUxnvefwL5ZIPHc,2334
|
|
274
|
-
unstructured_ingest/v2/cli/cmds/local.py,sha256=
|
|
274
|
+
unstructured_ingest/v2/cli/cmds/local.py,sha256=lGBFOVDRlrcCtPFjyk0IAYHLRWg95Kunu1Kli7t0ZK4,1899
|
|
275
275
|
unstructured_ingest/v2/cli/cmds/milvus.py,sha256=PB1ib1rFGGH_-KDi1bSIO3BIiVcqSJEHCBFFrzQrnmI,1998
|
|
276
276
|
unstructured_ingest/v2/cli/cmds/mongodb.py,sha256=oyV6tacuuxm3dN-AXQgbxvYJiDYo2OOWQKRSBCUGj0E,1823
|
|
277
277
|
unstructured_ingest/v2/cli/cmds/onedrive.py,sha256=DKqhQyyF-swZxs3C9G5W8ECleq8sWpDbpTuiAHXukXQ,2781
|
|
@@ -286,34 +286,32 @@ unstructured_ingest/v2/cli/cmds/fsspec/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
|
|
|
286
286
|
unstructured_ingest/v2/cli/cmds/fsspec/azure.py,sha256=ZHfchzSpGkZ99Fq1050JvHP0-aG1pZsBZxxozcFfxwI,2784
|
|
287
287
|
unstructured_ingest/v2/cli/cmds/fsspec/box.py,sha256=kslkI-0-GyGSJOU7bKgrZeQRXh8HFexDq87ew8kT8kE,1338
|
|
288
288
|
unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py,sha256=LtcR3rCQPgzJNbV3S90HlL0LPPbW9lYEfE8BG4F-dSI,1349
|
|
289
|
-
unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py,sha256=
|
|
289
|
+
unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py,sha256=Mgr_nto1FWxnGmbGdVlIfm-xuBGL0HEi8k3FEmQnZng,2414
|
|
290
290
|
unstructured_ingest/v2/cli/cmds/fsspec/gcs.py,sha256=3-0LYnDs0fgNrDqnHpNZKj_6rwNj9wQVaV0lGOhVFPE,2737
|
|
291
291
|
unstructured_ingest/v2/cli/cmds/fsspec/s3.py,sha256=EXQzYkDtkFli2sfcj4cRDRPFac7b7z1DfQqYlGQcE6o,2279
|
|
292
292
|
unstructured_ingest/v2/cli/cmds/fsspec/sftp.py,sha256=YY2xKguawMyLdcG0qDYKUgk7DT0KgyZJlV17MfwIhpo,2036
|
|
293
|
-
unstructured_ingest/v2/cli/configs/__init__.py,sha256=
|
|
293
|
+
unstructured_ingest/v2/cli/configs/__init__.py,sha256=5NMXm872QQZTvUFZFS06c8c1b6K940K5gxs9lbp8W6M,258
|
|
294
294
|
unstructured_ingest/v2/cli/configs/chunk.py,sha256=KvIhmIRIZxazCumMztAKdWs-4MK7qzOb5h6Ned_2bdU,3547
|
|
295
295
|
unstructured_ingest/v2/cli/configs/embed.py,sha256=q_TwnkxKTKOsMgVYfW6xxbD8FWjU_Uh_X2BQ5-_VLGM,2725
|
|
296
|
-
unstructured_ingest/v2/cli/configs/filter.py,sha256=KYe65_au6m7H4VrjgugC2ain6vsUSWswNSEgcG66VPU,841
|
|
297
296
|
unstructured_ingest/v2/cli/configs/partition.py,sha256=7wdI18V6c4kaXuf50Lh66n9LbtrYHYd8ffEgDQLqvSk,3931
|
|
298
297
|
unstructured_ingest/v2/cli/configs/processor.py,sha256=ZHu2DBIuE8VgL3mEt73yYimw2k_PaOEtdxxFqzHfk84,3350
|
|
299
|
-
unstructured_ingest/v2/interfaces/__init__.py,sha256
|
|
298
|
+
unstructured_ingest/v2/interfaces/__init__.py,sha256=-CHWUlT4rISd-gSfcGKGYFqqSFhMY9lKsT5wxwmOThM,845
|
|
300
299
|
unstructured_ingest/v2/interfaces/connector.py,sha256=u4hE1DpTPDC04-n_IzYyn9w1gNCiPT81anrUoEh30Z8,855
|
|
301
|
-
unstructured_ingest/v2/interfaces/downloader.py,sha256=
|
|
302
|
-
unstructured_ingest/v2/interfaces/file_data.py,sha256=
|
|
300
|
+
unstructured_ingest/v2/interfaces/downloader.py,sha256=aWlacZZrI6SGw6retnRJtZbqT5voOYq_fb326ynNOhI,2506
|
|
301
|
+
unstructured_ingest/v2/interfaces/file_data.py,sha256=5TCMkblUW-Jvy-rS5FqRT22VzDmJqAiQRIWYarpAi64,1543
|
|
303
302
|
unstructured_ingest/v2/interfaces/indexer.py,sha256=pMw0abNHk_tEuA4BkXX1BdAfIwHdytxj7s6tGxMvYRE,821
|
|
304
|
-
unstructured_ingest/v2/interfaces/process.py,sha256=
|
|
303
|
+
unstructured_ingest/v2/interfaces/process.py,sha256=0ecz7mAjlY_DUi9-HhPc9zXphmGclispYwv37O8gvJ0,466
|
|
305
304
|
unstructured_ingest/v2/interfaces/processor.py,sha256=uHVHeKo5Gt_zFkaEXw7xgaCBDTEl2-Amh-ByA07258o,1620
|
|
306
305
|
unstructured_ingest/v2/interfaces/upload_stager.py,sha256=SylhDl9pK6qa7hvfrhpabCkjwE03yIlI6oM-mQnqtho,1220
|
|
307
306
|
unstructured_ingest/v2/interfaces/uploader.py,sha256=bzfx3Ei4poXKu-hsgjAB4sj4jKij9CoaRSadUM5LtGk,1083
|
|
308
307
|
unstructured_ingest/v2/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
309
|
-
unstructured_ingest/v2/pipeline/interfaces.py,sha256=
|
|
310
|
-
unstructured_ingest/v2/pipeline/pipeline.py,sha256=
|
|
308
|
+
unstructured_ingest/v2/pipeline/interfaces.py,sha256=Zz76fLHNKw6BDsBSYQXiRa6CvyW91ulvZU0yw5vVQ5M,6544
|
|
309
|
+
unstructured_ingest/v2/pipeline/pipeline.py,sha256=r8jRMZI2RF8GQIuTcjIFBDeFtMnqpOJmKhEriy6Vo5Y,11616
|
|
311
310
|
unstructured_ingest/v2/pipeline/utils.py,sha256=oPAitfdnITqh2O8Z0uf6VOHg9BTJhitRzNmKXqTwPxg,422
|
|
312
311
|
unstructured_ingest/v2/pipeline/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
313
312
|
unstructured_ingest/v2/pipeline/steps/chunk.py,sha256=lfCsBo6A9u1cT57YaEjvNI79gc29nW8c-2_WZNjiO5Y,3275
|
|
314
|
-
unstructured_ingest/v2/pipeline/steps/download.py,sha256=
|
|
313
|
+
unstructured_ingest/v2/pipeline/steps/download.py,sha256=GA5-zTH4c7Ac8oBoc4AeDA8sQ0dYT-KUOZ4n31K3Jpg,4882
|
|
315
314
|
unstructured_ingest/v2/pipeline/steps/embed.py,sha256=VCdDBUXK6Yx8RTvRBpEFdFE7n0izvkP73w6s8Tv2sgg,3253
|
|
316
|
-
unstructured_ingest/v2/pipeline/steps/filter.py,sha256=mYVccl_zp0CGYFWBrSrPelvSElrXhZahebuymGuirV8,1341
|
|
317
315
|
unstructured_ingest/v2/pipeline/steps/index.py,sha256=i4RcJ1oRqNp-rFdc6rvKVGcSzNhdB7woW7_W364uThQ,2269
|
|
318
316
|
unstructured_ingest/v2/pipeline/steps/partition.py,sha256=q7-rpCj5Vy4BXtd7T72gxGb3xg6lmVyNmTwUfHil7Rg,3199
|
|
319
317
|
unstructured_ingest/v2/pipeline/steps/stage.py,sha256=A8i6VAFY4_xFJR0uBEyBNJlQXmTMGaflXsa6Wa6U1wQ,2274
|
|
@@ -323,39 +321,38 @@ unstructured_ingest/v2/processes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
|
|
|
323
321
|
unstructured_ingest/v2/processes/chunker.py,sha256=U6zQhaUG_dii66zqL9iEEGodHENNxnpn6V3pC-e7MMI,4233
|
|
324
322
|
unstructured_ingest/v2/processes/connector_registry.py,sha256=KOrvJNNRdpBPyqFwmTm42kD1xXuo7fNS_5yXjtqAz-c,2100
|
|
325
323
|
unstructured_ingest/v2/processes/embedder.py,sha256=QjAsiXAjWtZzh6lJ4D5LsTMBD81zuMBkegXNWq-FZt0,3308
|
|
326
|
-
unstructured_ingest/v2/processes/filter.py,sha256=CfQihLV_r4yTJgAc66mmbP4_xo3wcDlro5oR_KR--bM,1986
|
|
327
324
|
unstructured_ingest/v2/processes/partitioner.py,sha256=f6UQoQHVKjl8rmM5J9EcuP30RTFLSLrArGdC6qh-ffE,7645
|
|
328
325
|
unstructured_ingest/v2/processes/uncompress.py,sha256=x-JZYNs1zJOtRS7xNgiMyrYoAbzKM0p18O8NAl7avCA,1631
|
|
329
326
|
unstructured_ingest/v2/processes/connectors/__init__.py,sha256=7QMKd8gtEJTIuK352Ho6XyoFvLLhrWIzgdu0dXwXWOE,3960
|
|
330
|
-
unstructured_ingest/v2/processes/connectors/astra.py,sha256=
|
|
331
|
-
unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py,sha256=
|
|
332
|
-
unstructured_ingest/v2/processes/connectors/chroma.py,sha256=
|
|
333
|
-
unstructured_ingest/v2/processes/connectors/databricks_volumes.py,sha256=
|
|
334
|
-
unstructured_ingest/v2/processes/connectors/elasticsearch.py,sha256=
|
|
335
|
-
unstructured_ingest/v2/processes/connectors/google_drive.py,sha256
|
|
336
|
-
unstructured_ingest/v2/processes/connectors/local.py,sha256=
|
|
327
|
+
unstructured_ingest/v2/processes/connectors/astra.py,sha256=TSI_3GHnEh3gYAC30RTG4b2eEB07agroEFmJ38GnQY4,4903
|
|
328
|
+
unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py,sha256=PT02ZKiJuHMrmBClxqBsyDS0aXUQYLVg02Ns2qh1hD4,7935
|
|
329
|
+
unstructured_ingest/v2/processes/connectors/chroma.py,sha256=nYzNz-8oq-DN0Z4r7lHQFmlved76IaYeRvm7-EmbGUE,6998
|
|
330
|
+
unstructured_ingest/v2/processes/connectors/databricks_volumes.py,sha256=MTLK7SvQqWU-PXmEbGajM4f-CqGWlmlfeED6a5StEWw,3226
|
|
331
|
+
unstructured_ingest/v2/processes/connectors/elasticsearch.py,sha256=6QBvVzPk3mWj9ZqJZN7NvhcJaOO6nSLqLwU6zggP59A,14864
|
|
332
|
+
unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=IkLVafUu280OOoqYmdfdfMB6zlpiWjs2Z5J31ZzJOj4,12681
|
|
333
|
+
unstructured_ingest/v2/processes/connectors/local.py,sha256=maAXVKpRRXj_jseC6EPLTosMgw6ll-0lnGsDdAFLWAE,6646
|
|
337
334
|
unstructured_ingest/v2/processes/connectors/milvus.py,sha256=FWH4FH-zns7gh8sITg9pLYE9uKm_3GeOXJ4wjY6PMno,6776
|
|
338
|
-
unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=
|
|
339
|
-
unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=
|
|
335
|
+
unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=ErZWAD-su3OCRGv1h84X1PpAWleUPVZcFDEIYjtyP4E,4310
|
|
336
|
+
unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=WDDoFEfd8M_QBTpkGNI2zZGZZ_CR1rQiCsBWYOO2JoA,8311
|
|
340
337
|
unstructured_ingest/v2/processes/connectors/opensearch.py,sha256=HNRZVQsWnjLLm0yAGiIyHRbhAsBnGSXBO_VkUfIdwdE,5463
|
|
341
|
-
unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=
|
|
342
|
-
unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=
|
|
343
|
-
unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=
|
|
338
|
+
unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=PtAodxemYgiBZESx-g9a8fcL6XagJd9DIDQjrhE8aPk,5746
|
|
339
|
+
unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=Cz4qEtnbsD9-m1DXANxnVRZTHX2ZaUUBPVFPu5wnFRk,10832
|
|
340
|
+
unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=SNovgGUE5tHdfX_lF5zwM_QRZK7mahHzLZKhnqfk6Tc,17696
|
|
344
341
|
unstructured_ingest/v2/processes/connectors/singlestore.py,sha256=upF2O4hJ2uiBhDRrpQ8CSJUvzmqu2j5H1b_QbReHJpw,5168
|
|
345
|
-
unstructured_ingest/v2/processes/connectors/sql.py,sha256=
|
|
342
|
+
unstructured_ingest/v2/processes/connectors/sql.py,sha256=T0rpCbhEipWlezoJOMiUewcZuk6Had6TkmsDT-PeOL0,8360
|
|
346
343
|
unstructured_ingest/v2/processes/connectors/utils.py,sha256=nmpZZCeX0O7rGrwHSWM_heBgpZK9tKT6EV1Moer-z40,576
|
|
347
|
-
unstructured_ingest/v2/processes/connectors/weaviate.py,sha256=
|
|
344
|
+
unstructured_ingest/v2/processes/connectors/weaviate.py,sha256=7H3s44zVKbN3_eR35sbKTKSDOt6ZIIQkX-4t65LuJ6c,8254
|
|
348
345
|
unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Ypl_n6sl7I1JqX6bGSG0t_FqvCqE3Cy24og,1846
|
|
349
346
|
unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=RN7zoifocIWVgoP9aMDMz4TP-Z9KhE-HbCCBq33fY90,4674
|
|
350
347
|
unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=UnD-F9g7yOOBStrAqeKq6GuQjEyHdwOA3jYLj8YZIRM,4088
|
|
351
348
|
unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=I6mPG9EIso9TcIczCw5Y14Yqd-EhTQ2CLw1MJx1V3dY,4420
|
|
352
|
-
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=
|
|
349
|
+
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=MgOUhDGtTAUuzmAsLvBwV_3ggyL5DDpMm-sb4KNck88,12689
|
|
353
350
|
unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=RYZq_8hKF7bRxuB5Gozv5AzB3_nTuuooE4UfRjXwEFU,4443
|
|
354
|
-
unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=
|
|
351
|
+
unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=7lOm5hjb0LBkbe-OWXnV3wDC-3mM_GWwwmdKW0xzh8c,5333
|
|
355
352
|
unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=J7Ej-j7dtXAluHunwynUfHlNsYwymb-LsrGUFcljcsA,5700
|
|
356
353
|
unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
|
|
357
|
-
unstructured_ingest-0.0.2.dist-info/METADATA,sha256=
|
|
358
|
-
unstructured_ingest-0.0.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
359
|
-
unstructured_ingest-0.0.2.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
360
|
-
unstructured_ingest-0.0.2.dist-info/top_level.txt,sha256=QaTxTcjfM5Hr9sZJ6weOJvSe5ESQc0F8AWkhHInTCf8,20
|
|
361
|
-
unstructured_ingest-0.0.2.dist-info/RECORD,,
|
|
354
|
+
unstructured_ingest-0.0.2.dev0.dist-info/METADATA,sha256=dnWewzRLiYQlU6Fglws3oQnUkgzAVCnKM7BPMtls9YU,21573
|
|
355
|
+
unstructured_ingest-0.0.2.dev0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
356
|
+
unstructured_ingest-0.0.2.dev0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
357
|
+
unstructured_ingest-0.0.2.dev0.dist-info/top_level.txt,sha256=QaTxTcjfM5Hr9sZJ6weOJvSe5ESQc0F8AWkhHInTCf8,20
|
|
358
|
+
unstructured_ingest-0.0.2.dev0.dist-info/RECORD,,
|
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
6
|
-
from unstructured_ingest.v2.cli.utils import DelimitedString
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
@dataclass
|
|
10
|
-
class FilterCliConfig(CliConfig):
|
|
11
|
-
@staticmethod
|
|
12
|
-
def get_cli_options() -> list[click.Option]:
|
|
13
|
-
options = [
|
|
14
|
-
click.Option(
|
|
15
|
-
["--file-glob"],
|
|
16
|
-
default=None,
|
|
17
|
-
type=DelimitedString(),
|
|
18
|
-
help="A comma-separated list of file globs to limit which types of "
|
|
19
|
-
"local files are accepted, e.g. '*.html,*.txt'",
|
|
20
|
-
),
|
|
21
|
-
click.Option(
|
|
22
|
-
["--max-file-size"],
|
|
23
|
-
default=None,
|
|
24
|
-
type=click.IntRange(min=1),
|
|
25
|
-
help="Max file size to process in bytes",
|
|
26
|
-
),
|
|
27
|
-
]
|
|
28
|
-
return options
|
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
import asyncio
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
from typing import Callable, Optional
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.interfaces.file_data import FileData
|
|
6
|
-
from unstructured_ingest.v2.logger import logger
|
|
7
|
-
from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
|
|
8
|
-
from unstructured_ingest.v2.pipeline.utils import sterilize_dict
|
|
9
|
-
from unstructured_ingest.v2.processes.filter import Filterer
|
|
10
|
-
|
|
11
|
-
STEP_ID = "filter"
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
@dataclass
|
|
15
|
-
class FilterStep(PipelineStep):
|
|
16
|
-
process: Filterer
|
|
17
|
-
identifier: str = STEP_ID
|
|
18
|
-
|
|
19
|
-
def __post_init__(self):
|
|
20
|
-
config = (
|
|
21
|
-
sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
|
|
22
|
-
if self.process.config
|
|
23
|
-
else None
|
|
24
|
-
)
|
|
25
|
-
logger.info(f"Created {self.identifier} with configs: {config}")
|
|
26
|
-
|
|
27
|
-
async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
|
|
28
|
-
file_data = FileData.from_file(path=file_data_path)
|
|
29
|
-
fn_kwargs = {"file_data": file_data}
|
|
30
|
-
if not asyncio.iscoroutinefunction(fn):
|
|
31
|
-
resp = fn(**fn_kwargs)
|
|
32
|
-
elif semaphore := self.context.semaphore:
|
|
33
|
-
async with semaphore:
|
|
34
|
-
resp = await fn(**fn_kwargs)
|
|
35
|
-
else:
|
|
36
|
-
resp = await fn(**fn_kwargs)
|
|
37
|
-
|
|
38
|
-
if resp:
|
|
39
|
-
return {"file_data_path": file_data_path}
|
|
40
|
-
return None
|
|
@@ -1,54 +0,0 @@
|
|
|
1
|
-
import fnmatch
|
|
2
|
-
from abc import ABC
|
|
3
|
-
from dataclasses import dataclass, field
|
|
4
|
-
from typing import Any, Callable, Optional
|
|
5
|
-
|
|
6
|
-
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
|
|
7
|
-
from unstructured_ingest.v2.interfaces import FileData
|
|
8
|
-
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
9
|
-
from unstructured_ingest.v2.logger import logger
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
@dataclass
|
|
13
|
-
class FiltererConfig(EnhancedDataClassJsonMixin):
|
|
14
|
-
file_glob: Optional[list[str]] = None
|
|
15
|
-
max_file_size: Optional[int] = None
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
@dataclass
|
|
19
|
-
class Filterer(BaseProcess, ABC):
|
|
20
|
-
config: FiltererConfig = field(default_factory=lambda: FiltererConfig())
|
|
21
|
-
filters: list[Callable[[FileData], bool]] = field(init=False, default_factory=list)
|
|
22
|
-
|
|
23
|
-
def __post_init__(self):
|
|
24
|
-
# Populate the filters based on values in config
|
|
25
|
-
if self.config.file_glob is not None:
|
|
26
|
-
self.filters.append(self.glob_filter)
|
|
27
|
-
if self.config.max_file_size:
|
|
28
|
-
self.filters.append(self.file_size_filter)
|
|
29
|
-
|
|
30
|
-
def is_async(self) -> bool:
|
|
31
|
-
return False
|
|
32
|
-
|
|
33
|
-
def file_size_filter(self, file_data: FileData) -> bool:
|
|
34
|
-
if filesize_bytes := file_data.metadata.filesize_bytes:
|
|
35
|
-
return filesize_bytes <= self.config.max_file_size
|
|
36
|
-
return True
|
|
37
|
-
|
|
38
|
-
def glob_filter(self, file_data: FileData) -> bool:
|
|
39
|
-
patterns = self.config.file_glob
|
|
40
|
-
path = file_data.source_identifiers.fullpath
|
|
41
|
-
for pattern in patterns:
|
|
42
|
-
if fnmatch.filter([path], pattern):
|
|
43
|
-
return True
|
|
44
|
-
logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
|
|
45
|
-
return False
|
|
46
|
-
|
|
47
|
-
def run(self, file_data: FileData, **kwargs: Any) -> Optional[FileData]:
|
|
48
|
-
for filter in self.filters:
|
|
49
|
-
if not filter(file_data):
|
|
50
|
-
logger.debug(
|
|
51
|
-
f"filtered out file data due to {filter.__name__}: {file_data.identifier}"
|
|
52
|
-
)
|
|
53
|
-
return None
|
|
54
|
-
return file_data
|
|
File without changes
|
{unstructured_ingest-0.0.2.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{unstructured_ingest-0.0.2.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/top_level.txt
RENAMED
|
File without changes
|