unstructured-ingest 0.0.14__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +1 -1
- unstructured_ingest/cli/utils.py +1 -1
- unstructured_ingest/connector/astradb.py +1 -1
- unstructured_ingest/connector/biomed.py +4 -4
- unstructured_ingest/connector/chroma.py +1 -1
- unstructured_ingest/connector/databricks_volumes.py +2 -2
- unstructured_ingest/connector/fsspec/box.py +1 -1
- unstructured_ingest/connector/fsspec/fsspec.py +5 -5
- unstructured_ingest/connector/git.py +1 -1
- unstructured_ingest/connector/google_drive.py +4 -4
- unstructured_ingest/connector/hubspot.py +1 -1
- unstructured_ingest/connector/kafka.py +8 -8
- unstructured_ingest/connector/local.py +1 -1
- unstructured_ingest/connector/notion/helpers.py +4 -4
- unstructured_ingest/connector/onedrive.py +3 -3
- unstructured_ingest/connector/outlook.py +2 -2
- unstructured_ingest/connector/pinecone.py +1 -1
- unstructured_ingest/connector/sharepoint.py +8 -8
- unstructured_ingest/connector/vectara.py +6 -6
- unstructured_ingest/interfaces.py +4 -4
- unstructured_ingest/logger.py +1 -1
- unstructured_ingest/pipeline/copy.py +1 -1
- unstructured_ingest/pipeline/interfaces.py +2 -2
- unstructured_ingest/pipeline/partition.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/pipeline/reformat/chunking.py +2 -2
- unstructured_ingest/pipeline/reformat/embedding.py +1 -1
- unstructured_ingest/pipeline/source.py +2 -2
- unstructured_ingest/utils/compression.py +3 -3
- unstructured_ingest/utils/string_and_date_utils.py +2 -2
- unstructured_ingest/v2/cli/base/cmd.py +3 -3
- unstructured_ingest/v2/cli/base/dest.py +1 -1
- unstructured_ingest/v2/cli/base/src.py +1 -1
- unstructured_ingest/v2/cli/utils/click.py +1 -1
- unstructured_ingest/v2/interfaces/processor.py +48 -13
- unstructured_ingest/v2/logger.py +1 -1
- unstructured_ingest/v2/otel.py +1 -1
- unstructured_ingest/v2/pipeline/interfaces.py +9 -2
- unstructured_ingest/v2/pipeline/pipeline.py +17 -6
- unstructured_ingest/v2/pipeline/steps/chunk.py +3 -3
- unstructured_ingest/v2/pipeline/steps/download.py +17 -2
- unstructured_ingest/v2/pipeline/steps/embed.py +3 -3
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
- unstructured_ingest/v2/pipeline/steps/index.py +2 -2
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -3
- unstructured_ingest/v2/pipeline/steps/stage.py +1 -1
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -4
- unstructured_ingest/v2/processes/connectors/google_drive.py +1 -1
- unstructured_ingest/v2/processes/connectors/local.py +6 -5
- unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
- unstructured_ingest/v2/processes/connectors/onedrive.py +2 -2
- unstructured_ingest/v2/processes/connectors/opensearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +2 -2
- unstructured_ingest/v2/processes/connectors/sharepoint.py +9 -5
- unstructured_ingest/v2/processes/filter.py +1 -1
- unstructured_ingest/v2/processes/partitioner.py +3 -3
- unstructured_ingest/v2/utils.py +7 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.15.dist-info}/METADATA +213 -215
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.15.dist-info}/RECORD +69 -69
- unstructured_ingest/evaluate.py +0 -338
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.15.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.15.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.15.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.15.dist-info}/top_level.txt +0 -0
unstructured_ingest/__version__.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.0.
+__version__ = "0.0.15" # pragma: no cover
unstructured_ingest/cli/utils.py
CHANGED
@@ -30,7 +30,7 @@ def extract_config(flat_data: dict, config: t.Type[BaseConfig]) -> BaseConfig:
 To be able to extract a nested dataclass from a flat dictionary (as in one coming
 from a click-based options input), the config class is dynamically looked through for
 nested dataclass fields and new nested dictionaries are created to conform to the
-shape the overall class expects
+shape the overall class expects when parsing from a dict. During the process, this will create
 copies of the original dictionary to avoid pruning fields but this isn't a
 problem since the `from_dict()` method ignores unneeded values.

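The docstring above describes re-nesting a flat click-options dict so a nested config dataclass can be parsed from it. As an illustration only (not the library's `extract_config` implementation), a minimal sketch of the idea with hypothetical `AccessConfig`/`ConnectorConfig` classes:

```python
# Illustrative sketch only -- not unstructured_ingest's extract_config().
# AccessConfig and ConnectorConfig are hypothetical stand-ins for a nested config class.
from dataclasses import dataclass, fields, is_dataclass


@dataclass
class AccessConfig:
    api_key: str = ""


@dataclass
class ConnectorConfig:
    host: str = ""
    access_config: AccessConfig = None


def nest_flat_dict(flat: dict, config_cls) -> dict:
    """Copy the flat dict and group values into sub-dicts for nested dataclass fields."""
    data = dict(flat)  # work on a copy so the original dict is not pruned
    for f in fields(config_cls):
        if is_dataclass(f.type):
            nested_keys = {nf.name for nf in fields(f.type)}
            data[f.name] = {k: v for k, v in flat.items() if k in nested_keys}
    return data


print(nest_flat_dict({"host": "example.com", "api_key": "abc"}, ConnectorConfig))
# {'host': 'example.com', 'api_key': 'abc', 'access_config': {'api_key': 'abc'}}
```

The leftover top-level `api_key` mirrors the docstring's point: unneeded values are simply ignored when the nested class is built from the dict.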
unstructured_ingest/connector/astradb.py
CHANGED
@@ -222,7 +222,7 @@ class AstraDBDestinationConnector(BaseDestinationConnector):
 raise DestinationConnectionError(f"failed to validate connection: {e}")

 def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
-logger.info(f"
+logger.info(f"inserting / updating {len(elements_dict)} documents to Astra DB.")

 astra_db_batch_size = self.write_config.batch_size

unstructured_ingest/connector/biomed.py
CHANGED
@@ -123,7 +123,7 @@ class BiomedIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
 and self.filename.is_file()
 and not self.read_config.download_only
 ):
-logger.debug(f"
+logger.debug(f"cleaning up {self}")
 Path.unlink(self.filename)

 @SourceConnectionError.wrap
@@ -132,12 +132,12 @@ class BiomedIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
 download_path = self.file_meta.download_filepath # type: ignore
 dir_ = Path(os.path.dirname(download_path)) # type: ignore
 if not dir_.is_dir():
-logger.debug(f"
+logger.debug(f"creating directory: {dir_}")

 if dir_:
 dir_.mkdir(parents=True, exist_ok=True)
 self._retrieve()
-logger.debug(f"
+logger.debug(f"file downloaded: {self.file_meta.download_filepath}")

 @SourceConnectionNetworkError.wrap
 def _retrieve(self):
@@ -229,7 +229,7 @@ class BiomedSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):

 def traverse(path, download_dir, output_dir):
 full_path = Path(PMC_DIR) / path
-logger.debug(f"
+logger.debug(f"traversing directory: {full_path}")

 ftp = FTP(DOMAIN)
 ftp.login()
unstructured_ingest/connector/chroma.py
CHANGED
@@ -139,7 +139,7 @@ class ChromaDestinationConnector(BaseDestinationConnector):
 return chroma_dict

 def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
-logger.info(f"
+logger.info(f"inserting / updating {len(elements_dict)} documents to destination ")

 chroma_batch_size = self.write_config.batch_size

unstructured_ingest/connector/databricks_volumes.py
CHANGED
@@ -112,10 +112,10 @@ class DatabricksVolumesDestinationConnector(BaseDestinationConnector):
 **kwargs,
 ) -> None:
 output_folder = self.write_config.path
-output_folder = os.path.join(output_folder) # Make sure folder ends with file
+output_folder = os.path.join(output_folder) # Make sure folder ends with file separator
 filename = (
 filename.strip(os.sep) if filename else filename
-) # Make sure filename doesn't begin with file
+) # Make sure filename doesn't begin with file separator
 output_path = str(PurePath(output_folder, filename)) if filename else output_folder
 logger.debug(f"uploading content to {output_path}")
 self.client.files.upload(
unstructured_ingest/connector/fsspec/box.py
CHANGED
@@ -44,7 +44,7 @@ class SimpleBoxConfig(SimpleFsspecConfig):

 @requires_dependencies(["boxfs"], extras="box")
 def get_access_config(self) -> dict:
-# Return access_kwargs with oauth. The oauth object
+# Return access_kwargs with oauth. The oauth object cannot be stored directly in the config
 # because it is not serializable.
 from boxsdk import JWTAuth

unstructured_ingest/connector/fsspec/fsspec.py
CHANGED
@@ -221,12 +221,12 @@ class FsspecSourceConnector(
 for pattern in patterns:
 if fnmatch.filter([path], pattern):
 return True
-logger.debug(f"
+logger.debug(f"the file {path!r} is discarded as it does not match any given glob.")
 return False

 def get_ingest_docs(self):
 raw_files = self._list_files()
-# If glob filters provided, use to
+# If glob filters provided, use to filter on filepaths
 files = [f for f in raw_files if self.does_path_match_glob(f)]
 # remove compressed files
 compressed_file_ext = TAR_FILE_EXT + ZIP_FILE_EXT
@@ -328,13 +328,13 @@ class FsspecDestinationConnector(BaseDestinationConnector):
 **self.connector_config.get_access_config(),
 )

-logger.info(f"
+logger.info(f"writing content using filesystem: {type(fs).__name__}")

 output_folder = self.connector_config.path_without_protocol
-output_folder = os.path.join(output_folder) # Make sure folder ends with file
+output_folder = os.path.join(output_folder) # Make sure folder ends with file separator
 filename = (
 filename.strip(os.sep) if filename else filename
-) # Make sure filename doesn't begin with file
+) # Make sure filename doesn't begin with file separator
 output_path = str(PurePath(output_folder, filename)) if filename else output_folder
 full_output_path = f"{self.connector_config.protocol}://{output_path}"
 logger.debug(f"uploading content to {full_output_path}")
unstructured_ingest/connector/git.py
CHANGED
@@ -120,5 +120,5 @@ class GitSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
 for pattern in patterns:
 if fnmatch.filter([path], pattern):
 return True
-logger.debug(f"
+logger.debug(f"the file {path!r} is discarded as it does not match any given glob.")
 return False
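Several of the connectors touched above (fsspec, git, local) share this same `fnmatch`-based glob check. A standalone sketch of that pattern, with invented example paths and patterns:

```python
# Standalone sketch of the glob check seen in the hunks above; paths/patterns are invented.
import fnmatch
from typing import List, Optional


def does_path_match_glob(path: str, patterns: Optional[List[str]]) -> bool:
    """Return True when no patterns are given or the path matches at least one of them."""
    if not patterns:
        return True
    for pattern in patterns:
        if fnmatch.filter([path], pattern):
            return True
    print(f"the file {path!r} is discarded as it does not match any given glob.")
    return False


print(does_path_match_glob("docs/report.pdf", ["*.pdf"]))  # True
print(does_path_match_glob("docs/report.txt", ["*.pdf"]))  # False
```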
unstructured_ingest/connector/google_drive.py
CHANGED
@@ -222,7 +222,7 @@ class GoogleDriveIngestDoc(IngestDocSessionHandleMixin, IngestDocCleanupMixin, B
 dir_ = Path(self.meta["download_dir"])
 if dir_:
 if not dir_.is_dir():
-logger.debug(f"
+logger.debug(f"creating directory: {self.meta.get('download_dir')}")

 if dir_:
 dir_.mkdir(parents=True, exist_ok=True)
@@ -230,7 +230,7 @@ class GoogleDriveIngestDoc(IngestDocSessionHandleMixin, IngestDocCleanupMixin, B
 with open(self.filename, "wb") as handler:
 handler.write(file.getbuffer())
 saved = True
-logger.debug(f"
+logger.debug(f"file downloaded: {self.filename}.")
 if not saved:
 logger.error(f"Error while downloading and saving file: {self.filename}.")

@@ -241,7 +241,7 @@ class GoogleDriveIngestDoc(IngestDocSessionHandleMixin, IngestDocCleanupMixin, B
 self._output_filename.parent.mkdir(parents=True, exist_ok=True)
 with open(self._output_filename, "w") as output_f:
 output_f.write(json.dumps(self.isd_elems_no_filename, ensure_ascii=False, indent=2))
-logger.info(f"
+logger.info(f"wrote {self._output_filename}")


 @dataclass
@@ -295,7 +295,7 @@ class GoogleDriveSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnecto
 guess = guess_extension(export_mime)
 ext = guess if guess else ext

-# TODO
+# TODO(Habeeb): Consider filtering at the query level.
 if (
 self.connector_config.extension
 and self.connector_config.extension != ext
unstructured_ingest/connector/hubspot.py
CHANGED
@@ -271,7 +271,7 @@ class HubSpotSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):

 ingest_docs: t.List[HubSpotIngestDoc] = []
 for obj_name, obj_method in obj_method_resolver.items():
-logger.info(f"
+logger.info(f"retrieving - {obj_name}")
 results: t.List[HubSpotIngestDoc] = obj_method() # type: ignore
 ingest_docs += results # type: ignore

unstructured_ingest/connector/kafka.py
CHANGED
@@ -114,7 +114,7 @@ class KafkaSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):

 def initialize(self):
 topic = self.connector_config.topic
-logger.info(f"
+logger.info(f"subscribing to topic: {topic}")
 self.kafka_consumer.subscribe([topic])

 @property
@@ -149,7 +149,7 @@ class KafkaSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
 conf["sasl.password"] = secret

 consumer = Consumer(conf)
-logger.debug(f"
+logger.debug(f"kafka consumer connected to bootstrap: {bootstrap}")
 return consumer

 @SourceConnectionError.wrap
@@ -161,7 +161,7 @@ class KafkaSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):

 collected = []
 num_messages_to_consume = self.connector_config.num_messages_to_consume
-logger.info(f"
+logger.info(f"config set for blocking on {num_messages_to_consume} messages")
 # Consume specified number of messages
 while running:
 msg = consumer.poll(timeout=self.connector_config.timeout)
@@ -178,7 +178,7 @@ class KafkaSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
 else:
 collected.append(json.loads(msg.value().decode("utf8")))
 if len(collected) >= num_messages_to_consume:
-logger.debug(f"
+logger.debug(f"found {len(collected)} messages, stopping")
 consumer.commit(asynchronous=False)
 break

@@ -243,7 +243,7 @@ class KafkaDestinationConnector(IngestDocSessionHandleMixin, BaseDestinationConn
 conf["sasl.password"] = secret

 producer = Producer(conf)
-logger.debug(f"
+logger.debug(f"connected to bootstrap: {bootstrap}")
 return producer

 def check_connection(self):
@@ -255,7 +255,7 @@ class KafkaDestinationConnector(IngestDocSessionHandleMixin, BaseDestinationConn

 @DestinationConnectionError.wrap
 def upload_msg(self, batch) -> int:
-logger.debug(f"
+logger.debug(f"uploading batch: {batch}")
 topic = self.connector_config.topic
 producer = self.kafka_producer
 uploaded = 0
@@ -267,7 +267,7 @@ class KafkaDestinationConnector(IngestDocSessionHandleMixin, BaseDestinationConn

 @DestinationConnectionError.wrap
 def write_dict(self, *args, dict_list: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
-logger.info(f"
+logger.info(f"writing {len(dict_list)} documents to Kafka")
 num_uploaded = 0

 for chunk in batch_generator(dict_list, self.write_config.batch_size):
@@ -275,7 +275,7 @@ class KafkaDestinationConnector(IngestDocSessionHandleMixin, BaseDestinationConn

 producer = self.kafka_producer
 producer.flush()
-logger.info(f"
+logger.info(f"uploaded {num_uploaded} documents to Kafka")

 def write(self, docs: t.List[BaseIngestDoc]) -> None:
 content_list: t.List[t.Dict[str, t.Any]] = []
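The Kafka hunks above show the confluent-kafka consumer loop: subscribe to a topic, poll with a timeout, stop after a configured number of messages, then commit. A minimal sketch assuming the `confluent_kafka` package and a local broker; the bootstrap address, topic, and message count are placeholders, not values from this diff:

```python
# Minimal consumer sketch assuming confluent-kafka; broker/topic/count are placeholders.
import json

from confluent_kafka import Consumer

conf = {
    "bootstrap.servers": "localhost:9092",  # placeholder bootstrap address
    "group.id": "unstructured-ingest-demo",
    "auto.offset.reset": "earliest",
}
consumer = Consumer(conf)
consumer.subscribe(["ingest-topic"])  # placeholder topic

collected = []
num_messages_to_consume = 10  # placeholder count
try:
    while len(collected) < num_messages_to_consume:
        msg = consumer.poll(timeout=1.0)
        if msg is None:
            continue
        if msg.error():
            print(f"consumer error: {msg.error()}")
            continue
        collected.append(json.loads(msg.value().decode("utf8")))
    consumer.commit(asynchronous=False)  # commit offsets once the batch is collected
finally:
    consumer.close()
print(f"found {len(collected)} messages, stopping")
```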
unstructured_ingest/connector/local.py
CHANGED
@@ -123,7 +123,7 @@ class LocalSourceConnector(BaseSourceConnector):
 for pattern in patterns:
 if fnmatch.filter([path], pattern):
 return True
-logger.debug(f"
+logger.debug(f"the file {path!r} is discarded as it does not match any given glob.")
 return False

 def get_ingest_docs(self):
unstructured_ingest/connector/notion/helpers.py
CHANGED
@@ -103,7 +103,7 @@ def extract_page_html(
 ):
 children.extend(children_block)
 if children:
-logger.debug(f"
+logger.debug(f"adding {len(children)} children from parent: {parent}")
 for child in children:
 if child.id not in processed_block_ids:
 parents.append((level + 1, child))
@@ -159,7 +159,7 @@ def extract_database_html(
 for page_chunk in client.databases.iterate_query(database_id=database_id): # type: ignore
 all_pages.extend(page_chunk)

-logger.debug(f"
+logger.debug(f"creating {len(all_pages)} rows")
 for page in all_pages:
 if is_database_url(client=client, url=page.url):
 child_databases.append(page.id)
@@ -237,7 +237,7 @@ def get_recursive_content(
 parent: QueueEntry = parents.pop()
 processed.append(str(parent.id))
 if parent.type == QueueEntryType.PAGE:
-logger.debug(f"
+logger.debug(f"getting child data from page: {parent.id}")
 page_children = []
 try:
 for children_block in client.blocks.children.iterate_list( # type: ignore
@@ -316,7 +316,7 @@ def get_recursive_content(
 )

 elif parent.type == QueueEntryType.DATABASE:
-logger.debug(f"
+logger.debug(f"getting child data from database: {parent.id}")
 database_pages = []
 try:
 for page_entries in client.databases.iterate_query( # type: ignore
unstructured_ingest/connector/onedrive.py
CHANGED
@@ -157,17 +157,17 @@ class OneDriveIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
 self.output_dir.mkdir(parents=True, exist_ok=True)

 if not self.download_dir.is_dir():
-logger.debug(f"
+logger.debug(f"creating directory: {self.download_dir}")
 self.download_dir.mkdir(parents=True, exist_ok=True)

 if fsize > MAX_MB_SIZE:
-logger.info(f"
+logger.info(f"downloading file with size: {fsize} bytes in chunks")
 with self.filename.open(mode="wb") as f:
 file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query()
 else:
 with self.filename.open(mode="wb") as f:
 file.download(f).execute_query()
-logger.info(f"
+logger.info(f"file downloaded: {self.filename}")
 return


unstructured_ingest/connector/outlook.py
CHANGED
@@ -164,7 +164,7 @@ class OutlookIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
 self.connector_config._get_client()
 self.update_source_metadata()
 if not self.download_dir.is_dir():
-logger.debug(f"
+logger.debug(f"creating directory: {self.download_dir}")
 self.download_dir.mkdir(parents=True, exist_ok=True)

 with open(
@@ -182,7 +182,7 @@ class OutlookIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
 )
 logger.error(e)
 return
-logger.info(f"
+logger.info(f"file downloaded: {self.hash_mail_name(self.message_id)}")
 return


unstructured_ingest/connector/pinecone.py
CHANGED
@@ -80,7 +80,7 @@ class PineconeDestinationConnector(IngestDocSessionHandleMixin, BaseDestinationC
 )

 index = pc.Index(self.connector_config.index_name)
-logger.debug(f"
+logger.debug(f"connected to index: {pc.describe_index(self.connector_config.index_name)}")
 return index

 @DestinationConnectionError.wrap
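For context on the Pinecone hunk, a minimal sketch of obtaining an index handle with the `pinecone` client. The API key and index name are placeholders, and the `Pinecone(api_key=...)` constructor is assumed from the current pinecone client rather than shown in this diff; only `pc.Index(...)` and `pc.describe_index(...)` appear in the hunk above:

```python
# Sketch assuming the pinecone client; the API key and index name are placeholders.
from pinecone import Pinecone

pc = Pinecone(api_key="YOUR_API_KEY")  # assumed constructor, not part of this diff
index = pc.Index("my-index")           # same call pattern as the hunk above
print(f"connected to index: {pc.describe_index('my-index')}")
```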
unstructured_ingest/connector/sharepoint.py
CHANGED
@@ -253,11 +253,11 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):

 self.output_dir.mkdir(parents=True, exist_ok=True)
 if not self.download_dir.is_dir():
-logger.debug(f"
+logger.debug(f"creating directory: {self.download_dir}")
 self.download_dir.mkdir(parents=True, exist_ok=True)
 with self.filename.open(mode="w") as f:
 f.write(pld)
-logger.info(f"
+logger.info(f"file downloaded: {self.filename}")

 def _download_file(self):
 file = self._fetch_file()
@@ -266,17 +266,17 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
 self.output_dir.mkdir(parents=True, exist_ok=True)

 if not self.download_dir.is_dir():
-logger.debug(f"
+logger.debug(f"creating directory: {self.download_dir}")
 self.download_dir.mkdir(parents=True, exist_ok=True)

 if fsize > MAX_MB_SIZE:
-logger.info(f"
+logger.info(f"downloading file with size: {fsize} bytes in chunks")
 with self.filename.open(mode="wb") as f:
 file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query()
 else:
 with self.filename.open(mode="wb") as f:
 file.download(f).execute_query()
-logger.info(f"
+logger.info(f"file downloaded: {self.filename}")

 @BaseSingleIngestDoc.skip_if_file_exists
 @SourceConnectionError.wrap
@@ -374,7 +374,7 @@ class SharepointSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector
 if self.connector_config.process_pages:
 page_output = self._list_pages(site_client)
 if not page_output:
-logger.info(f"
+logger.info(f"couldn't process pages for site {site_client.base_url}")
 output = output + page_output
 return output

@@ -404,7 +404,7 @@ class SharepointSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector
 tenant_sites = {s.url for s in tenant_sites if (s.url is not None)}
 ingest_docs: t.List[SharepointIngestDoc] = []
 for site_url in tenant_sites:
-logger.info(f"
+logger.info(f"processing docs for site: {site_url}")
 site_client = self.connector_config.get_site_client(site_url)
 ingest_docs = ingest_docs + self._ingest_site_docs(site_client)
 return ingest_docs
@@ -440,7 +440,7 @@ class SharepointPermissionsConnector:
 if response.status_code == 200:
 return response.json()
 else:
-logger.info(f"
+logger.info(f"request failed with status code {response.status_code}:")
 logger.info(response.text)

 @requires_dependencies(["requests"], extras="sharepoint")
unstructured_ingest/connector/vectara.py
CHANGED
@@ -181,7 +181,7 @@ class VectaraDestinationConnector(BaseDestinationConnector):
 try:
 result = self._request(endpoint="index", data=body, http_method="POST")
 except Exception as e:
-logger.info(f"
+logger.info(f"exception {e} while indexing document {document['documentId']}")
 return

 if (
@@ -196,18 +196,18 @@ class VectaraDestinationConnector(BaseDestinationConnector):
 )
 )
 ):
-logger.info(f"
+logger.info(f"document {document['documentId']} already exists, re-indexing")
 self._delete_doc(document["documentId"])
 result = self._request(endpoint="index", data=body, http_method="POST")
 return

 if "status" in result and result["status"] and "OK" in result["status"]["code"]:
-logger.info(f"
+logger.info(f"indexing document {document['documentId']} succeeded")
 else:
-logger.info(f"
+logger.info(f"indexing document {document['documentId']} failed, response = {result}")

 def write_dict(self, *args, docs_list: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
-logger.info(f"
+logger.info(f"inserting / updating {len(docs_list)} documents to Vectara ")
 for vdoc in docs_list:
 self._index_document(vdoc)

@@ -216,7 +216,7 @@ class VectaraDestinationConnector(BaseDestinationConnector):

 def get_metadata(element) -> t.Dict[str, t.Any]:
 """
-Select which meta-data fields to include and
+Select which meta-data fields to include and optionally map them to a new new.
 remove the "metadata-" prefix from the keys
 """
 metadata_map = {
unstructured_ingest/interfaces.py
CHANGED
@@ -529,7 +529,7 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
 and self.filename.is_file()
 and self.filename.stat().st_size
 ):
-logger.debug(f"
+logger.debug(f"file exists: {self.filename}, skipping {func.__name__}")
 return None
 return func(self, *args, **kwargs)

@@ -586,7 +586,7 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):

 endpoint = partition_config.partition_endpoint

-logger.debug(f"
+logger.debug(f"using remote partition ({endpoint})")

 elements = partition_via_api(
 filename=str(self.filename),
@@ -606,7 +606,7 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
 self._date_processed = datetime.utcnow().isoformat()
 if self.read_config.download_only:
 return None
-logger.info(f"
+logger.info(f"processing {self.filename}")

 elements = self.partition_file(partition_config=partition_config, **partition_kwargs)
 element_dicts = [e.to_dict() for e in elements]
@@ -824,7 +824,7 @@ class IngestDocCleanupMixin:
 and self.filename.is_file()
 and not self.read_config.download_only
 ):
-logger.debug(f"
+logger.debug(f"cleaning up {self}")
 os.unlink(self.filename)


unstructured_ingest/logger.py
CHANGED
@@ -95,7 +95,7 @@ class SensitiveFormatter(logging.Formatter):


 def remove_root_handlers(logger: logging.Logger) -> None:
-# NOTE(robinson)
+# NOTE(robinson): in some environments such as Google Colab, there is a root handler
 # that doesn't not mask secrets, meaning sensitive info such as api keys appear in logs.
 # Removing these when they exist prevents this behavior
 if logger.root.hasHandlers():
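The note added above explains why root handlers are stripped: environments like Google Colab install a root handler that bypasses the secret-masking formatter. A standard-library sketch of removing pre-existing root handlers, independent of unstructured_ingest's own helper:

```python
# Standard-library sketch: drop pre-existing root handlers so only a handler we
# control (e.g. one with a secret-masking formatter) emits records.
# This is not the package's exact helper.
import logging


def remove_root_handlers(logger: logging.Logger) -> None:
    if logger.root.hasHandlers():
        for handler in list(logger.root.handlers):
            logger.root.removeHandler(handler)


logging.basicConfig()                      # simulate an environment-provided root handler
log = logging.getLogger("unstructured.ingest")
remove_root_handlers(log)
log.addHandler(logging.StreamHandler())    # re-attach a handler we control
log.warning("root handlers removed; this record goes through our handler only")
```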
unstructured_ingest/pipeline/copy.py
CHANGED
@@ -15,5 +15,5 @@ class Copier(CopyNode):
 ingest_doc = create_ingest_doc_from_dict(ingest_doc_dict)
 desired_output = ingest_doc._output_filename
 Path(desired_output).parent.mkdir(parents=True, exist_ok=True)
-logger.info(f"
+logger.info(f"copying {json_path} -> {desired_output}")
 shutil.copy(json_path, desired_output)
unstructured_ingest/pipeline/interfaces.py
CHANGED
@@ -57,7 +57,7 @@ class PipelineNode(DataClassJsonMixin, ABC):
 iterable = iterable if iterable else []
 if iterable:
 logger.info(
-f"
+f"calling {self.__class__.__name__} " f"with {len(iterable)} docs", # type: ignore
 )

 self.initialize()
@@ -92,7 +92,7 @@ class PipelineNode(DataClassJsonMixin, ABC):

 def initialize(self):
 if path := self.get_path():
-logger.info(f"
+logger.info(f"creating {path}")
 path.mkdir(parents=True, exist_ok=True)
 ingest_log_streaming_init(logging.DEBUG if self.pipeline_context.verbose else logging.INFO)

unstructured_ingest/pipeline/partition.py
CHANGED
@@ -30,7 +30,7 @@ class Partitioner(PartitionNode):
 and json_path.is_file()
 and json_path.stat().st_size
 ):
-logger.info(f"
+logger.info(f"file exists: {json_path}, skipping partition")
 return str(json_path)
 partition_kwargs: t.Dict[str, t.Any] = {
 "strategy": self.partition_config.strategy,
unstructured_ingest/pipeline/pipeline.py
CHANGED
@@ -96,7 +96,7 @@ class Pipeline(DataClassJsonMixin):
 for reformat_node in self.reformat_nodes:
 reformatted_jsons = reformat_node(iterable=partitioned_jsons)
 if not reformatted_jsons:
-logger.info(f"
+logger.info(f"no files to process after {reformat_node.__class__.__name__}")
 return
 partitioned_jsons = reformatted_jsons

unstructured_ingest/pipeline/reformat/chunking.py
CHANGED
@@ -58,7 +58,7 @@ class Chunker(ReformatNode):
 and json_path.is_file()
 and json_path.stat().st_size
 ):
-logger.debug(f"
+logger.debug(f"file exists: {json_path}, skipping chunking")
 return str(json_path)

 chunked_elements = self.chunk(elements_json)
@@ -112,7 +112,7 @@ class Chunker(ReformatNode):

 return partition_via_api(
 filename=elements_json_file,
-# -- (jennings) If api_key or api_url are None, partition_via_api will raise an
+# -- NOTE(jennings): If api_key or api_url are None, partition_via_api will raise an
 # -- error, which will be caught and logged by Chunker.run()
 api_key=self.partition_config.api_key, # type: ignore
 api_url=self.partition_config.partition_endpoint, # type: ignore
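The Chunker hunk above passes `api_key` and `api_url` through to `partition_via_api`. A hedged sketch of a hosted-API call using only the parameters visible in this diff; the import path is assumed from the `unstructured` package, and the file path, key, and endpoint URL are placeholders:

```python
# Sketch of calling the hosted partitioning API with the parameters seen in the diff.
# The import path is assumed from the `unstructured` package; all values are placeholders.
from unstructured.partition.api import partition_via_api

elements = partition_via_api(
    filename="example-docs/report.pdf",  # placeholder file path
    api_key="UNSTRUCTURED_API_KEY",      # placeholder; raises if None
    api_url="https://example.invalid/general/v0/general",  # placeholder endpoint
)
print(f"partitioned into {len(elements)} elements")
```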
unstructured_ingest/pipeline/reformat/embedding.py
CHANGED
@@ -44,7 +44,7 @@ class Embedder(ReformatNode):
 and json_path.is_file()
 and json_path.stat().st_size
 ):
-logger.debug(f"
+logger.debug(f"file exists: {json_path}, skipping embedding")
 return str(json_path)
 with open(elements_json) as f:
 elements = json.load(f)
unstructured_ingest/pipeline/source.py
CHANGED
@@ -24,12 +24,12 @@ class Reader(SourceNode):
 and doc.filename.is_file()
 and doc.filename.stat().st_size
 ):
-logger.info(f"
+logger.info(f"file exists: {doc.filename}, skipping download")
 # Still need to fetch metadata if file exists locally
 doc.update_source_metadata()
 else:
 serialized_doc = doc.to_json(redact_sensitive=True)
-logger.debug(f"
+logger.debug(f"fetching {serialized_doc} - PID: {os.getpid()}")
 if self.retry_strategy:
 self.retry_strategy(doc.get_file)
 else:
unstructured_ingest/utils/compression.py
CHANGED
@@ -22,7 +22,7 @@ TAR_FILE_EXT = [".tar", ".tar.gz", ".tgz"]

 def uncompress_file(filename: str, path: Optional[str] = None) -> str:
 """
-Takes in a compressed zip or tar file and
+Takes in a compressed zip or tar file and decompresses it
 """
 # Create path if it doesn't already exist
 if path:
@@ -65,7 +65,7 @@ def uncompress_tar_file(tar_filename: str, path: Optional[str] = None) -> str:
 logger.info(f"extracting tar {tar_filename} -> {path}")
 # NOTE: "r:*" mode opens both compressed (e.g ".tar.gz") and uncompressed ".tar" archives
 with tarfile.open(tar_filename, "r:*") as tfile:
-# NOTE(robinson: Mitigate against malicious content being extracted from the tar file.
+# NOTE(robinson): Mitigate against malicious content being extracted from the tar file.
 # This was added in Python 3.12
 # Ref: https://docs.python.org/3/library/tarfile.html#extraction-filters
 if sys.version_info >= (3, 12):
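The hunk above fixes a NOTE about the Python 3.12 tar extraction filter. A small standard-library sketch of the version-gated pattern it refers to; the archive and destination paths are placeholders:

```python
# Version-gated use of tarfile extraction filters (Python 3.12+), per the docs
# linked in the NOTE above. Archive and destination paths are placeholders.
import sys
import tarfile

tar_filename = "example.tar.gz"  # placeholder archive
path = "extracted"               # placeholder destination directory

with tarfile.open(tar_filename, "r:*") as tfile:
    if sys.version_info >= (3, 12):
        # The "data" filter blocks absolute paths, path traversal, and special files.
        tfile.extractall(path=path, filter="data")
    else:
        tfile.extractall(path=path)
```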
@@ -113,6 +113,6 @@ class CompressionSourceConnectorMixin:
 read_config=new_read_configs,
 processor_config=new_process_configs,
 )
-logger.info(f"
+logger.info(f"created local source connector: {local_connector.to_json()}")
 local_connector.initialize()
 return local_connector.get_ingest_docs()