unstructured-ingest 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic.


This version of unstructured-ingest might be problematic; review the changes below for more details.

@@ -1 +1 @@
1
- __version__ = "1.0.1" # pragma: no cover
1
+ __version__ = "1.0.2" # pragma: no cover
@@ -1,5 +1,7 @@
1
+ import asyncio
1
2
  import csv
2
3
  import hashlib
4
+ import os
3
5
  import re
4
6
  from dataclasses import dataclass, field
5
7
  from pathlib import Path
@@ -8,7 +10,6 @@ from typing import TYPE_CHECKING, Any, Generator, Optional
8
10
 
9
11
  from pydantic import BaseModel, Field, Secret
10
12
 
11
- from unstructured_ingest import __name__ as integration_name
12
13
  from unstructured_ingest.__version__ import __version__ as integration_version
13
14
  from unstructured_ingest.data_types.file_data import (
14
15
  BatchFileData,
@@ -83,10 +84,8 @@ class AstraDBConnectionConfig(ConnectionConfig):
83
84
 
84
85
  # Create a client object to interact with the Astra DB
85
86
  # caller_name/version for Astra DB tracking
86
- return AstraDBClient(
87
- caller_name=integration_name,
88
- caller_version=integration_version,
89
- )
87
+ user_agent = os.getenv("UNSTRUCTURED_USER_AGENT", "unstructuredio_oss")
88
+ return AstraDBClient(callers=[(user_agent, integration_version)])
90
89
 
91
90
 
92
91
  def get_astra_db(
@@ -141,7 +140,7 @@ async def get_async_astra_collection(
141
140
  )
142
141
 
143
142
  # Get async collection from AsyncDatabase
144
- async_astra_db_collection = await async_astra_db.get_collection(name=collection_name)
143
+ async_astra_db_collection = async_astra_db.get_collection(name=collection_name)
145
144
  return async_astra_db_collection
146
145
 
147
146
 
@@ -360,13 +359,22 @@ class AstraDBUploader(Uploader):
360
359
  upload_config: AstraDBUploaderConfig
361
360
  connector_type: str = CONNECTOR_TYPE
362
361
 
362
+ def is_async(self) -> bool:
363
+ return True
364
+
363
365
  def init(self, **kwargs: Any) -> None:
364
366
  self.create_destination(**kwargs)
365
367
 
368
+ @requires_dependencies(["astrapy"], extras="astradb")
366
369
  def precheck(self) -> None:
367
370
  try:
368
371
  if self.upload_config.collection_name:
369
- self.get_collection(collection_name=self.upload_config.collection_name).options()
372
+ collection = get_astra_collection(
373
+ connection_config=self.connection_config,
374
+ collection_name=self.upload_config.collection_name,
375
+ keyspace=self.upload_config.keyspace,
376
+ )
377
+ collection.options()
370
378
  else:
371
379
  # check for db connection only if collection name is not provided
372
380
  get_astra_db(
@@ -377,17 +385,7 @@ class AstraDBUploader(Uploader):
377
385
  logger.error(f"Failed to validate connection {e}", exc_info=True)
378
386
  raise DestinationConnectionError(f"failed to validate connection: {e}")
379
387
 
380
- @requires_dependencies(["astrapy"], extras="astradb")
381
- def get_collection(self, collection_name: Optional[str] = None) -> "AstraDBCollection":
382
- return get_astra_collection(
383
- connection_config=self.connection_config,
384
- collection_name=collection_name or self.upload_config.collection_name,
385
- keyspace=self.upload_config.keyspace,
386
- )
387
-
388
388
  def _collection_exists(self, collection_name: str):
389
- from astrapy.exceptions import CollectionNotFoundException
390
-
391
389
  collection = get_astra_collection(
392
390
  connection_config=self.connection_config,
393
391
  collection_name=collection_name,
@@ -397,8 +395,10 @@ class AstraDBUploader(Uploader):
397
395
  try:
398
396
  collection.options()
399
397
  return True
400
- except CollectionNotFoundException:
401
- return False
398
+ except RuntimeError as e:
399
+ if "not found" in str(e):
400
+ return False
401
+ raise DestinationConnectionError(f"failed to check if astra collection exists : {e}")
402
402
  except Exception as e:
403
403
  logger.error(f"failed to check if astra collection exists : {e}")
404
404
  raise DestinationConnectionError(f"failed to check if astra collection exists : {e}")
@@ -422,6 +422,8 @@ class AstraDBUploader(Uploader):
422
422
  self.upload_config.collection_name = collection_name
423
423
 
424
424
  if not self._collection_exists(collection_name):
425
+ from astrapy.info import CollectionDefinition
426
+
425
427
  astra_db = get_astra_db(
426
428
  connection_config=self.connection_config, keyspace=self.upload_config.keyspace
427
429
  )
@@ -429,44 +431,56 @@ class AstraDBUploader(Uploader):
429
431
  f"creating default astra collection '{collection_name}' with dimension "
430
432
  f"{vector_length} and metric {similarity_metric}"
431
433
  )
432
- astra_db.create_collection(
433
- collection_name,
434
- dimension=vector_length,
435
- metric=similarity_metric,
434
+ definition = (
435
+ CollectionDefinition.builder()
436
+ .set_vector_dimension(dimension=vector_length)
437
+ .set_vector_metric(similarity_metric)
438
+ .build()
436
439
  )
440
+ (astra_db.create_collection(collection_name, definition=definition),)
437
441
  return True
438
442
  logger.debug(f"collection with name '{collection_name}' already exists, skipping creation")
439
443
  return False
440
444
 
441
- def delete_by_record_id(self, collection: "AstraDBCollection", file_data: FileData):
445
+ async def delete_by_record_id(self, collection: "AstraDBAsyncCollection", file_data: FileData):
442
446
  logger.debug(
443
447
  f"deleting records from collection {collection.name} "
444
448
  f"with {self.upload_config.record_id_key} "
445
449
  f"set to {file_data.identifier}"
446
450
  )
447
451
  delete_filter = {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
448
- delete_resp = collection.delete_many(filter=delete_filter)
452
+ delete_resp = await collection.delete_many(filter=delete_filter)
449
453
  logger.debug(
450
454
  f"deleted {delete_resp.deleted_count} records from collection {collection.name}"
451
455
  )
452
456
 
453
- def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
457
+ async def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
454
458
  logger.info(
455
459
  f"writing {len(data)} objects to destination "
456
460
  f"collection {self.upload_config.collection_name}"
457
461
  )
458
462
 
459
463
  astra_db_batch_size = self.upload_config.batch_size
460
- collection = self.get_collection()
461
-
462
- self.delete_by_record_id(collection=collection, file_data=file_data)
464
+ async_astra_collection = await get_async_astra_collection(
465
+ connection_config=self.connection_config,
466
+ collection_name=self.upload_config.collection_name,
467
+ keyspace=self.upload_config.keyspace,
468
+ )
463
469
 
464
- for chunk in batch_generator(data, astra_db_batch_size):
465
- collection.insert_many(chunk)
470
+ await self.delete_by_record_id(collection=async_astra_collection, file_data=file_data)
471
+ await asyncio.gather(
472
+ *[
473
+ async_astra_collection.insert_many(chunk)
474
+ for chunk in batch_generator(data, astra_db_batch_size)
475
+ ]
476
+ )
466
477
 
467
- def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
478
+ async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
468
479
  data = get_json_data(path=path)
469
- self.run_data(data=data, file_data=file_data, **kwargs)
480
+ await self.run_data(data=data, file_data=file_data)
481
+
482
+ def run(self, **kwargs: Any) -> Any:
483
+ raise NotImplementedError("Use astradb run_async instead")
470
484
 
471
485
 
472
486
  astra_db_source_entry = SourceRegistryEntry(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unstructured_ingest
3
- Version: 1.0.1
3
+ Version: 1.0.2
4
4
  Summary: Local ETL data pipeline to get data RAG ready
5
5
  Author-email: Unstructured Technologies <devops@unstructuredai.io>
6
6
  License-Expression: Apache-2.0
@@ -28,7 +28,7 @@ Provides-Extra: airtable
28
28
  Requires-Dist: pandas; extra == 'airtable'
29
29
  Requires-Dist: pyairtable; extra == 'airtable'
30
30
  Provides-Extra: astradb
31
- Requires-Dist: astrapy; extra == 'astradb'
31
+ Requires-Dist: astrapy>2.0.0; extra == 'astradb'
32
32
  Provides-Extra: azure
33
33
  Requires-Dist: adlfs; extra == 'azure'
34
34
  Requires-Dist: fsspec; extra == 'azure'
@@ -1,5 +1,5 @@
1
1
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
2
- unstructured_ingest/__version__.py,sha256=Bkcw0TdkF4pWY_01piNW3D1XaG9Q-r4aIMSbnIeStCE,42
2
+ unstructured_ingest/__version__.py,sha256=tMfsOjk6uygoNUsekl3a802jffTlVo6ELbuAqqeWH0c,42
3
3
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
4
4
  unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
5
5
  unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
@@ -62,7 +62,7 @@ unstructured_ingest/processes/partitioner.py,sha256=Kn_BSFYvOkwo8fqThw_cOpgD0Um-
62
62
  unstructured_ingest/processes/uncompress.py,sha256=o9JL3Bza4KPUTmrB39-v_5SuK_fYwhwFAhjQi2Pm8h8,2426
63
63
  unstructured_ingest/processes/connectors/__init__.py,sha256=cR4ZH2dpPod7QR6OsgMx8X9kpFcEc1TVfQndUNoKGzI,6812
64
64
  unstructured_ingest/processes/connectors/airtable.py,sha256=smx5qBSUKwM8V6Xcc7ikrf8hYQUQ94YrB1L0WVeRDv0,9024
65
- unstructured_ingest/processes/connectors/astradb.py,sha256=ONt8vHv5h8B6goGba9l0YPS0y5EnSAoowtfq92-E-RY,18307
65
+ unstructured_ingest/processes/connectors/astradb.py,sha256=Ob9wQgDxa6BXDPZBOqooNKQgvjIZcMwIe4fW3VlI7h8,18929
66
66
  unstructured_ingest/processes/connectors/azure_ai_search.py,sha256=szhSRXzUHk0DE2hGFfjGc_jNFzlUwiRlCtIkuu7tmnk,11524
67
67
  unstructured_ingest/processes/connectors/chroma.py,sha256=q5_Fu4xb6_W_NyrPxVa3-jVwZLqVdlBNlR4dFvbd7l0,7235
68
68
  unstructured_ingest/processes/connectors/confluence.py,sha256=BbZ-Ecdcn92X8dHQ0egEJtBoX16gM0-zMcBLdn-wQsM,12090
@@ -230,8 +230,8 @@ unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01q
230
230
  unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
231
231
  unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
232
232
  unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
233
- unstructured_ingest-1.0.1.dist-info/METADATA,sha256=k_kEG2BSsnNaIyDSJWiciUW0Z-HDiPF_flO6kLjn8QI,8713
234
- unstructured_ingest-1.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
235
- unstructured_ingest-1.0.1.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
236
- unstructured_ingest-1.0.1.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
237
- unstructured_ingest-1.0.1.dist-info/RECORD,,
233
+ unstructured_ingest-1.0.2.dist-info/METADATA,sha256=jmorweX10DhCfe--4Uz_9mQ5HIyjcd5qigZt_jP_c1c,8719
234
+ unstructured_ingest-1.0.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
235
+ unstructured_ingest-1.0.2.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
236
+ unstructured_ingest-1.0.2.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
237
+ unstructured_ingest-1.0.2.dist-info/RECORD,,