unstructured-ingest 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -1 +1 @@
1
- __version__ = "1.2.0" # pragma: no cover
1
+ __version__ = "1.2.2" # pragma: no cover
@@ -67,14 +67,14 @@ class BaseEmbeddingEncoder(BaseEncoder, ABC):
67
67
  elements = elements.copy()
68
68
  elements_with_text = [e for e in elements if e.get("text")]
69
69
  texts = [e["text"] for e in elements_with_text]
70
- embeddings = []
70
+ all_embeddings = []
71
71
  try:
72
72
  for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
73
- embeddings = self.embed_batch(client=client, batch=batch)
74
- embeddings.extend(embeddings)
73
+ embeddings_batch = self.embed_batch(client=client, batch=batch)
74
+ all_embeddings.extend(embeddings_batch)
75
75
  except Exception as e:
76
76
  raise self.wrap_error(e=e)
77
- for element, embedding in zip(elements_with_text, embeddings):
77
+ for element, embedding in zip(elements_with_text, all_embeddings, strict=True):
78
78
  element[EMBEDDINGS_KEY] = embedding
79
79
  return elements
80
80
 
@@ -123,14 +123,14 @@ class AsyncBaseEmbeddingEncoder(BaseEncoder, ABC):
123
123
  elements = elements.copy()
124
124
  elements_with_text = [e for e in elements if e.get("text")]
125
125
  texts = [e["text"] for e in elements_with_text]
126
- embeddings = []
126
+ all_embeddings = []
127
127
  try:
128
128
  for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
129
- embeddings = await self.embed_batch(client=client, batch=batch)
130
- embeddings.extend(embeddings)
129
+ embeddings_batch = await self.embed_batch(client=client, batch=batch)
130
+ all_embeddings.extend(embeddings_batch)
131
131
  except Exception as e:
132
132
  raise self.wrap_error(e=e)
133
- for element, embedding in zip(elements_with_text, embeddings):
133
+ for element, embedding in zip(elements_with_text, all_embeddings, strict=True):
134
134
  element[EMBEDDINGS_KEY] = embedding
135
135
  return elements
136
136
 
@@ -36,9 +36,11 @@ class Downloader(BaseProcess, BaseConnector, ABC):
36
36
  def get_download_path(self, file_data: FileData) -> Optional[Path]:
37
37
  if not file_data.source_identifiers:
38
38
  return None
39
+
39
40
  rel_path = file_data.source_identifiers.relative_path
40
41
  if not rel_path:
41
42
  return None
43
+
42
44
  rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
43
45
  return self.download_dir / Path(rel_path)
44
46
 
@@ -264,12 +264,31 @@ FsspecDownloaderConfigT = TypeVar("FsspecDownloaderConfigT", bound=FsspecDownloa
264
264
 
265
265
  @dataclass
266
266
  class FsspecDownloader(Downloader):
267
+ TEMP_DIR_PREFIX = "unstructured_"
268
+
267
269
  protocol: str
268
270
  connection_config: FsspecConnectionConfigT
269
271
  connector_type: str = CONNECTOR_TYPE
270
272
  download_config: Optional[FsspecDownloaderConfigT] = field(
271
273
  default_factory=lambda: FsspecDownloaderConfig()
272
274
  )
275
+
276
+ def get_download_path(self, file_data: FileData) -> Optional[Path]:
277
+ has_source_identifiers = file_data.source_identifiers is not None
278
+ has_filename = has_source_identifiers and file_data.source_identifiers.filename
279
+
280
+ if not (has_source_identifiers and has_filename):
281
+ return None
282
+
283
+ filename = file_data.source_identifiers.filename
284
+
285
+ mkdir_concurrent_safe(self.download_dir)
286
+
287
+ temp_dir = tempfile.mkdtemp(
288
+ prefix=self.TEMP_DIR_PREFIX,
289
+ dir=self.download_dir
290
+ )
291
+ return Path(temp_dir) / filename
273
292
 
274
293
  def is_async(self) -> bool:
275
294
  with self.connection_config.get_client(protocol=self.protocol) as client:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unstructured_ingest
3
- Version: 1.2.0
3
+ Version: 1.2.2
4
4
  Summary: Local ETL data pipeline to get data RAG ready
5
5
  Author-email: Unstructured Technologies <devops@unstructuredai.io>
6
6
  License-Expression: Apache-2.0
@@ -1,5 +1,5 @@
1
1
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
2
- unstructured_ingest/__version__.py,sha256=hYDudzvEd17dAmuuGVXnxBsg1JfgQBVtuuoFz06SeYo,42
2
+ unstructured_ingest/__version__.py,sha256=HPy7TMxiKrkQS-Rrw57HuZN3ZHBCTvYH8fjgFH1cXxs,41
3
3
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
4
4
  unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
5
5
  unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
@@ -25,7 +25,7 @@ unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
25
25
  unstructured_ingest/embed/azure_openai.py,sha256=Q_buBkAcx9FBuTsAqKbRU8vd9vDh8JoDOEth4fFxHbg,2160
26
26
  unstructured_ingest/embed/bedrock.py,sha256=dzfCsatB0i8hUp1YnXmoImoxgvUdZ4srKI6eSvn-lYM,9132
27
27
  unstructured_ingest/embed/huggingface.py,sha256=6Gx9L3xa3cv9fX4AMuLsePJQF4T_jwkKjovfqF5X1NM,2435
28
- unstructured_ingest/embed/interfaces.py,sha256=Y3PLhgWnMDmtpugE37hlAiBIbC8izrFFXXkrPVby-HY,5137
28
+ unstructured_ingest/embed/interfaces.py,sha256=VCrCSJiEfIxKB4NL4AHgKb-0vB_SEekb47zMUW6gWf0,5211
29
29
  unstructured_ingest/embed/mixedbreadai.py,sha256=uKTqzoi4M_WeYZu-qc_TSxwJONOESzxVbBLUbD1Wbns,3922
30
30
  unstructured_ingest/embed/octoai.py,sha256=yZuD7R4mEKS4Jjyae_IrNWogMPOFFS8gW5oUllj3ROU,4540
31
31
  unstructured_ingest/embed/openai.py,sha256=09I5BIrb-iGsv92LOV46-F7oZ7j1JnJIOQFARNKVq3k,5029
@@ -34,7 +34,7 @@ unstructured_ingest/embed/vertexai.py,sha256=DphvPhiYdXTMrQxJCd-64vMs4iVdLY_BphH
34
34
  unstructured_ingest/embed/voyageai.py,sha256=EOrYzaoXOZ6C4fNkMlCgb8KA8rdfgVXN3USMFpnn0Bs,4698
35
35
  unstructured_ingest/interfaces/__init__.py,sha256=QIkWqjsq9INTa89gPuXlMlQL4s3y5TqLmPkuVuTyXcs,795
36
36
  unstructured_ingest/interfaces/connector.py,sha256=wYWIEAL99KdQDDzzDYSf_yE8p1wjThSPMgEV5qyfiPc,1885
37
- unstructured_ingest/interfaces/downloader.py,sha256=xX0ZzsFRSzZb7SAeoeQph8sIbVq13DRw-3MYkdADrY0,2918
37
+ unstructured_ingest/interfaces/downloader.py,sha256=7pJ4wpWrP645lgTx9dO0rni8chiCAjsnOFaXRtJe8IY,2936
38
38
  unstructured_ingest/interfaces/indexer.py,sha256=c2FwWJEQHfFD6vO-tGfYLpLiIs-TYViLAt8YmHfDbaM,824
39
39
  unstructured_ingest/interfaces/process.py,sha256=S3A_9gkwwGC-iQxvnpj3Er6IJAjAT5npzpSgxuFAzUM,449
40
40
  unstructured_ingest/interfaces/processor.py,sha256=VX7JqXlbG1plxMK8THWhWINPbTICaaUEk4XUXhnOixY,3303
@@ -109,7 +109,7 @@ unstructured_ingest/processes/connectors/fsspec/__init__.py,sha256=3HTdw4L4mdN4W
109
109
  unstructured_ingest/processes/connectors/fsspec/azure.py,sha256=31VNiG5YnXfhrFX7QJ2O1ubeWHxbe1sYVIztefbscAQ,7148
110
110
  unstructured_ingest/processes/connectors/fsspec/box.py,sha256=1gLS7xR2vbjgKBrQ4ZpI1fKTsJuIDfXuAzx_a4FzxG4,5873
111
111
  unstructured_ingest/processes/connectors/fsspec/dropbox.py,sha256=HwwKjQmjM7yFk9Esh_F20xDisRPXGUkFduzaasByRDE,8355
112
- unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=fA9jtXnr1P4wr8VBpZ1Lx9TsZzH-FDqHoBvPUH0DnWk,17827
112
+ unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=yIvaII_uQ6ANibyj9aysM6c7fg5vUuL2eccLb51LhWk,18497
113
113
  unstructured_ingest/processes/connectors/fsspec/gcs.py,sha256=ouxISCKpZTAj3T6pWGYbASu93wytJjl5WSICvQcrgfE,7172
114
114
  unstructured_ingest/processes/connectors/fsspec/s3.py,sha256=P5nd3hamhLFO3l5nV3lMuIxHtb_rZYFP4F6q_py3xpc,7492
115
115
  unstructured_ingest/processes/connectors/fsspec/sftp.py,sha256=pR_a2SgLjt8ffNkariHrPB1E0HVSTj5h3pt7KxTU3TI,6371
@@ -235,8 +235,8 @@ unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3r
235
235
  unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
236
236
  unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
237
237
  unstructured_ingest/utils/tls.py,sha256=Ra8Mii1F4VqErRreg76PBI0eAqPBC009l0sSHa8FdnA,448
238
- unstructured_ingest-1.2.0.dist-info/METADATA,sha256=3D67Gk9trwGIVvMh0oSUDx_aJDsYCHD0qwL1VB9ZoYw,8826
239
- unstructured_ingest-1.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
240
- unstructured_ingest-1.2.0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
241
- unstructured_ingest-1.2.0.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
242
- unstructured_ingest-1.2.0.dist-info/RECORD,,
238
+ unstructured_ingest-1.2.2.dist-info/METADATA,sha256=kLg62BHEhhU0BK_73Qc0XqsKtrf5XN3pzD40eGXW3xM,8826
239
+ unstructured_ingest-1.2.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
240
+ unstructured_ingest-1.2.2.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
241
+ unstructured_ingest-1.2.2.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
242
+ unstructured_ingest-1.2.2.dist-info/RECORD,,