unstructured-ingest 0.0.14__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +1 -1
- unstructured_ingest/cli/utils.py +1 -1
- unstructured_ingest/connector/astradb.py +1 -1
- unstructured_ingest/connector/biomed.py +4 -4
- unstructured_ingest/connector/chroma.py +1 -1
- unstructured_ingest/connector/databricks_volumes.py +2 -2
- unstructured_ingest/connector/fsspec/box.py +1 -1
- unstructured_ingest/connector/fsspec/fsspec.py +5 -5
- unstructured_ingest/connector/git.py +1 -1
- unstructured_ingest/connector/google_drive.py +4 -4
- unstructured_ingest/connector/hubspot.py +1 -1
- unstructured_ingest/connector/kafka.py +8 -8
- unstructured_ingest/connector/local.py +1 -1
- unstructured_ingest/connector/notion/helpers.py +4 -4
- unstructured_ingest/connector/onedrive.py +3 -3
- unstructured_ingest/connector/outlook.py +2 -2
- unstructured_ingest/connector/pinecone.py +1 -1
- unstructured_ingest/connector/sharepoint.py +8 -8
- unstructured_ingest/connector/vectara.py +6 -6
- unstructured_ingest/interfaces.py +4 -4
- unstructured_ingest/logger.py +1 -1
- unstructured_ingest/pipeline/copy.py +1 -1
- unstructured_ingest/pipeline/interfaces.py +2 -2
- unstructured_ingest/pipeline/partition.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/pipeline/reformat/chunking.py +2 -2
- unstructured_ingest/pipeline/reformat/embedding.py +1 -1
- unstructured_ingest/pipeline/source.py +2 -2
- unstructured_ingest/utils/compression.py +3 -3
- unstructured_ingest/utils/string_and_date_utils.py +2 -2
- unstructured_ingest/v2/cli/base/cmd.py +3 -3
- unstructured_ingest/v2/cli/base/dest.py +1 -1
- unstructured_ingest/v2/cli/base/src.py +1 -1
- unstructured_ingest/v2/cli/utils/click.py +1 -1
- unstructured_ingest/v2/interfaces/processor.py +48 -13
- unstructured_ingest/v2/logger.py +1 -1
- unstructured_ingest/v2/otel.py +1 -1
- unstructured_ingest/v2/pipeline/interfaces.py +9 -2
- unstructured_ingest/v2/pipeline/pipeline.py +17 -6
- unstructured_ingest/v2/pipeline/steps/chunk.py +3 -3
- unstructured_ingest/v2/pipeline/steps/download.py +17 -2
- unstructured_ingest/v2/pipeline/steps/embed.py +3 -3
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
- unstructured_ingest/v2/pipeline/steps/index.py +2 -2
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -3
- unstructured_ingest/v2/pipeline/steps/stage.py +1 -1
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -4
- unstructured_ingest/v2/processes/connectors/google_drive.py +1 -1
- unstructured_ingest/v2/processes/connectors/local.py +6 -5
- unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
- unstructured_ingest/v2/processes/connectors/onedrive.py +2 -2
- unstructured_ingest/v2/processes/connectors/opensearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +2 -2
- unstructured_ingest/v2/processes/connectors/sharepoint.py +9 -5
- unstructured_ingest/v2/processes/filter.py +1 -1
- unstructured_ingest/v2/processes/partitioner.py +3 -3
- unstructured_ingest/v2/utils.py +7 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.15.dist-info}/METADATA +213 -215
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.15.dist-info}/RECORD +69 -69
- unstructured_ingest/evaluate.py +0 -338
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.15.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.15.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.15.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.15.dist-info}/top_level.txt +0 -0
unstructured_ingest/__version__.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.0.
+__version__ = "0.0.15" # pragma: no cover
unstructured_ingest/cli/utils.py
CHANGED
@@ -30,7 +30,7 @@ def extract_config(flat_data: dict, config: t.Type[BaseConfig]) -> BaseConfig:
 To be able to extract a nested dataclass from a flat dictionary (as in one coming
 from a click-based options input), the config class is dynamically looked through for
 nested dataclass fields and new nested dictionaries are created to conform to the
-shape the overall class expects
+shape the overall class expects when parsing from a dict. During the process, this will create
 copies of the original dictionary to avoid pruning fields but this isn't a
 problem since the `from_dict()` method ignores unneeded values.

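The docstring above describes re-nesting a flat click-options dict so a nested config dataclass can be parsed from it. As an illustration only (not the library's `extract_config` implementation), a minimal sketch of the idea with hypothetical `AccessConfig`/`ConnectorConfig` classes:

```python
# Illustrative sketch only -- not unstructured_ingest's extract_config().
# AccessConfig and ConnectorConfig are hypothetical stand-ins for a nested config class.
from dataclasses import dataclass, fields, is_dataclass


@dataclass
class AccessConfig:
    api_key: str = ""


@dataclass
class ConnectorConfig:
    host: str = ""
    access_config: AccessConfig = None


def nest_flat_dict(flat: dict, config_cls) -> dict:
    """Copy the flat dict and group values into sub-dicts for nested dataclass fields."""
    data = dict(flat)  # work on a copy so the original dict is not pruned
    for f in fields(config_cls):
        if is_dataclass(f.type):
            nested_keys = {nf.name for nf in fields(f.type)}
            data[f.name] = {k: v for k, v in flat.items() if k in nested_keys}
    return data


print(nest_flat_dict({"host": "example.com", "api_key": "abc"}, ConnectorConfig))
# {'host': 'example.com', 'api_key': 'abc', 'access_config': {'api_key': 'abc'}}
```

The leftover top-level `api_key` mirrors the docstring's point: unneeded values are simply ignored when the nested class is built from the dict.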
unstructured_ingest/connector/astradb.py
CHANGED
@@ -222,7 +222,7 @@ class AstraDBDestinationConnector(BaseDestinationConnector):
 raise DestinationConnectionError(f"failed to validate connection: {e}")

 def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
-logger.info(f"
+logger.info(f"inserting / updating {len(elements_dict)} documents to Astra DB.")

 astra_db_batch_size = self.write_config.batch_size

unstructured_ingest/connector/biomed.py
CHANGED
@@ -123,7 +123,7 @@ class BiomedIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
 and self.filename.is_file()
 and not self.read_config.download_only
 ):
-logger.debug(f"
+logger.debug(f"cleaning up {self}")
 Path.unlink(self.filename)

 @SourceConnectionError.wrap
@@ -132,12 +132,12 @@ class BiomedIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
 download_path = self.file_meta.download_filepath # type: ignore
 dir_ = Path(os.path.dirname(download_path)) # type: ignore
 if not dir_.is_dir():
-logger.debug(f"
+logger.debug(f"creating directory: {dir_}")

 if dir_:
 dir_.mkdir(parents=True, exist_ok=True)
 self._retrieve()
-logger.debug(f"
+logger.debug(f"file downloaded: {self.file_meta.download_filepath}")

 @SourceConnectionNetworkError.wrap
 def _retrieve(self):
@@ -229,7 +229,7 @@ class BiomedSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):

 def traverse(path, download_dir, output_dir):
 full_path = Path(PMC_DIR) / path
-logger.debug(f"
+logger.debug(f"traversing directory: {full_path}")

 ftp = FTP(DOMAIN)
 ftp.login()
unstructured_ingest/connector/chroma.py
CHANGED
@@ -139,7 +139,7 @@ class ChromaDestinationConnector(BaseDestinationConnector):
 return chroma_dict

 def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
-logger.info(f"
+logger.info(f"inserting / updating {len(elements_dict)} documents to destination ")

 chroma_batch_size = self.write_config.batch_size

unstructured_ingest/connector/databricks_volumes.py
CHANGED
@@ -112,10 +112,10 @@ class DatabricksVolumesDestinationConnector(BaseDestinationConnector):
 **kwargs,
 ) -> None:
 output_folder = self.write_config.path
-output_folder = os.path.join(output_folder) # Make sure folder ends with file
+output_folder = os.path.join(output_folder) # Make sure folder ends with file separator
 filename = (
 filename.strip(os.sep) if filename else filename
-) # Make sure filename doesn't begin with file
+) # Make sure filename doesn't begin with file separator
 output_path = str(PurePath(output_folder, filename)) if filename else output_folder
 logger.debug(f"uploading content to {output_path}")
 self.client.files.upload(
unstructured_ingest/connector/fsspec/box.py
CHANGED
@@ -44,7 +44,7 @@ class SimpleBoxConfig(SimpleFsspecConfig):

 @requires_dependencies(["boxfs"], extras="box")
 def get_access_config(self) -> dict:
-# Return access_kwargs with oauth. The oauth object
+# Return access_kwargs with oauth. The oauth object cannot be stored directly in the config
 # because it is not serializable.
 from boxsdk import JWTAuth

unstructured_ingest/connector/fsspec/fsspec.py
CHANGED
@@ -221,12 +221,12 @@ class FsspecSourceConnector(
 for pattern in patterns:
 if fnmatch.filter([path], pattern):
 return True
-logger.debug(f"
+logger.debug(f"the file {path!r} is discarded as it does not match any given glob.")
 return False

 def get_ingest_docs(self):
 raw_files = self._list_files()
-# If glob filters provided, use to
+# If glob filters provided, use to filter on filepaths
 files = [f for f in raw_files if self.does_path_match_glob(f)]
 # remove compressed files
 compressed_file_ext = TAR_FILE_EXT + ZIP_FILE_EXT
@@ -328,13 +328,13 @@ class FsspecDestinationConnector(BaseDestinationConnector):
 **self.connector_config.get_access_config(),
 )

-logger.info(f"
+logger.info(f"writing content using filesystem: {type(fs).__name__}")

 output_folder = self.connector_config.path_without_protocol
-output_folder = os.path.join(output_folder) # Make sure folder ends with file
+output_folder = os.path.join(output_folder) # Make sure folder ends with file separator
 filename = (
 filename.strip(os.sep) if filename else filename
-) # Make sure filename doesn't begin with file
+) # Make sure filename doesn't begin with file separator
 output_path = str(PurePath(output_folder, filename)) if filename else output_folder
 full_output_path = f"{self.connector_config.protocol}://{output_path}"
 logger.debug(f"uploading content to {full_output_path}")
unstructured_ingest/connector/git.py
CHANGED
@@ -120,5 +120,5 @@ class GitSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
 for pattern in patterns:
 if fnmatch.filter([path], pattern):
 return True
-logger.debug(f"
+logger.debug(f"the file {path!r} is discarded as it does not match any given glob.")
 return False
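Several of the connectors touched above (fsspec, git, local) share this same `fnmatch`-based glob check. A standalone sketch of that pattern, with invented example paths and patterns:

```python
# Standalone sketch of the glob check seen in the hunks above; paths/patterns are invented.
import fnmatch
from typing import List, Optional


def does_path_match_glob(path: str, patterns: Optional[List[str]]) -> bool:
    """Return True when no patterns are given or the path matches at least one of them."""
    if not patterns:
        return True
    for pattern in patterns:
        if fnmatch.filter([path], pattern):
            return True
    print(f"the file {path!r} is discarded as it does not match any given glob.")
    return False


print(does_path_match_glob("docs/report.pdf", ["*.pdf"]))  # True
print(does_path_match_glob("docs/report.txt", ["*.pdf"]))  # False
```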
unstructured_ingest/connector/google_drive.py
CHANGED
@@ -222,7 +222,7 @@ class GoogleDriveIngestDoc(IngestDocSessionHandleMixin, IngestDocCleanupMixin, B
 dir_ = Path(self.meta["download_dir"])
 if dir_:
 if not dir_.is_dir():
-logger.debug(f"
+logger.debug(f"creating directory: {self.meta.get('download_dir')}")

 if dir_:
 dir_.mkdir(parents=True, exist_ok=True)
@@ -230,7 +230,7 @@ class GoogleDriveIngestDoc(IngestDocSessionHandleMixin, IngestDocCleanupMixin, B
 with open(self.filename, "wb") as handler:
 handler.write(file.getbuffer())
 saved = True
-logger.debug(f"
+logger.debug(f"file downloaded: {self.filename}.")
 if not saved:
 logger.error(f"Error while downloading and saving file: {self.filename}.")

@@ -241,7 +241,7 @@ class GoogleDriveIngestDoc(IngestDocSessionHandleMixin, IngestDocCleanupMixin, B
 self._output_filename.parent.mkdir(parents=True, exist_ok=True)
 with open(self._output_filename, "w") as output_f:
 output_f.write(json.dumps(self.isd_elems_no_filename, ensure_ascii=False, indent=2))
-logger.info(f"
+logger.info(f"wrote {self._output_filename}")


 @dataclass
@@ -295,7 +295,7 @@ class GoogleDriveSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnecto
 guess = guess_extension(export_mime)
 ext = guess if guess else ext

-# TODO
+# TODO(Habeeb): Consider filtering at the query level.
 if (
 self.connector_config.extension
 and self.connector_config.extension != ext
unstructured_ingest/connector/hubspot.py
CHANGED
@@ -271,7 +271,7 @@ class HubSpotSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):

 ingest_docs: t.List[HubSpotIngestDoc] = []
 for obj_name, obj_method in obj_method_resolver.items():
-logger.info(f"
+logger.info(f"retrieving - {obj_name}")
 results: t.List[HubSpotIngestDoc] = obj_method() # type: ignore
 ingest_docs += results # type: ignore

unstructured_ingest/connector/kafka.py
CHANGED
@@ -114,7 +114,7 @@ class KafkaSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):

 def initialize(self):
 topic = self.connector_config.topic
-logger.info(f"
+logger.info(f"subscribing to topic: {topic}")
 self.kafka_consumer.subscribe([topic])

 @property
@@ -149,7 +149,7 @@ class KafkaSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
 conf["sasl.password"] = secret

 consumer = Consumer(conf)
-logger.debug(f"
+logger.debug(f"kafka consumer connected to bootstrap: {bootstrap}")
 return consumer

 @SourceConnectionError.wrap
@@ -161,7 +161,7 @@ class KafkaSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):

 collected = []
 num_messages_to_consume = self.connector_config.num_messages_to_consume
-logger.info(f"
+logger.info(f"config set for blocking on {num_messages_to_consume} messages")
 # Consume specified number of messages
 while running:
 msg = consumer.poll(timeout=self.connector_config.timeout)
@@ -178,7 +178,7 @@ class KafkaSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
 else:
 collected.append(json.loads(msg.value().decode("utf8")))
 if len(collected) >= num_messages_to_consume:
-logger.debug(f"
+logger.debug(f"found {len(collected)} messages, stopping")
 consumer.commit(asynchronous=False)
 break

@@ -243,7 +243,7 @@ class KafkaDestinationConnector(IngestDocSessionHandleMixin, BaseDestinationConn
 conf["sasl.password"] = secret

 producer = Producer(conf)
-logger.debug(f"
+logger.debug(f"connected to bootstrap: {bootstrap}")
 return producer

 def check_connection(self):
@@ -255,7 +255,7 @@ class KafkaDestinationConnector(IngestDocSessionHandleMixin, BaseDestinationConn

 @DestinationConnectionError.wrap
 def upload_msg(self, batch) -> int:
-logger.debug(f"
+logger.debug(f"uploading batch: {batch}")
 topic = self.connector_config.topic
 producer = self.kafka_producer
 uploaded = 0
@@ -267,7 +267,7 @@ class KafkaDestinationConnector(IngestDocSessionHandleMixin, BaseDestinationConn

 @DestinationConnectionError.wrap
 def write_dict(self, *args, dict_list: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
-logger.info(f"
+logger.info(f"writing {len(dict_list)} documents to Kafka")
 num_uploaded = 0

 for chunk in batch_generator(dict_list, self.write_config.batch_size):
@@ -275,7 +275,7 @@ class KafkaDestinationConnector(IngestDocSessionHandleMixin, BaseDestinationConn

 producer = self.kafka_producer
 producer.flush()
-logger.info(f"
+logger.info(f"uploaded {num_uploaded} documents to Kafka")

 def write(self, docs: t.List[BaseIngestDoc]) -> None:
 content_list: t.List[t.Dict[str, t.Any]] = []
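The Kafka hunks above show the confluent-kafka consumer loop: subscribe to a topic, poll with a timeout, stop after a configured number of messages, then commit. A minimal sketch assuming the `confluent_kafka` package and a local broker; the bootstrap address, topic, and message count are placeholders, not values from this diff:

```python
# Minimal consumer sketch assuming confluent-kafka; broker/topic/count are placeholders.
import json

from confluent_kafka import Consumer

conf = {
    "bootstrap.servers": "localhost:9092",  # placeholder bootstrap address
    "group.id": "unstructured-ingest-demo",
    "auto.offset.reset": "earliest",
}
consumer = Consumer(conf)
consumer.subscribe(["ingest-topic"])  # placeholder topic

collected = []
num_messages_to_consume = 10  # placeholder count
try:
    while len(collected) < num_messages_to_consume:
        msg = consumer.poll(timeout=1.0)
        if msg is None:
            continue
        if msg.error():
            print(f"consumer error: {msg.error()}")
            continue
        collected.append(json.loads(msg.value().decode("utf8")))
    consumer.commit(asynchronous=False)  # commit offsets once the batch is collected
finally:
    consumer.close()
print(f"found {len(collected)} messages, stopping")
```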
unstructured_ingest/connector/local.py
CHANGED
@@ -123,7 +123,7 @@ class LocalSourceConnector(BaseSourceConnector):
 for pattern in patterns:
 if fnmatch.filter([path], pattern):
 return True
-logger.debug(f"
+logger.debug(f"the file {path!r} is discarded as it does not match any given glob.")
 return False

 def get_ingest_docs(self):
unstructured_ingest/connector/notion/helpers.py
CHANGED
@@ -103,7 +103,7 @@ def extract_page_html(
 ):
 children.extend(children_block)
 if children:
-logger.debug(f"
+logger.debug(f"adding {len(children)} children from parent: {parent}")
 for child in children:
 if child.id not in processed_block_ids:
 parents.append((level + 1, child))
@@ -159,7 +159,7 @@ def extract_database_html(
 for page_chunk in client.databases.iterate_query(database_id=database_id): # type: ignore
 all_pages.extend(page_chunk)

-logger.debug(f"
+logger.debug(f"creating {len(all_pages)} rows")
 for page in all_pages:
 if is_database_url(client=client, url=page.url):
 child_databases.append(page.id)
@@ -237,7 +237,7 @@ def get_recursive_content(
 parent: QueueEntry = parents.pop()
 processed.append(str(parent.id))
 if parent.type == QueueEntryType.PAGE:
-logger.debug(f"
+logger.debug(f"getting child data from page: {parent.id}")
 page_children = []
 try:
 for children_block in client.blocks.children.iterate_list( # type: ignore
@@ -316,7 +316,7 @@ def get_recursive_content(
 )

 elif parent.type == QueueEntryType.DATABASE:
-logger.debug(f"
+logger.debug(f"getting child data from database: {parent.id}")
 database_pages = []
 try:
 for page_entries in client.databases.iterate_query( # type: ignore
unstructured_ingest/connector/onedrive.py
CHANGED
@@ -157,17 +157,17 @@ class OneDriveIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
 self.output_dir.mkdir(parents=True, exist_ok=True)

 if not self.download_dir.is_dir():
-logger.debug(f"
+logger.debug(f"creating directory: {self.download_dir}")
 self.download_dir.mkdir(parents=True, exist_ok=True)

 if fsize > MAX_MB_SIZE:
-logger.info(f"
+logger.info(f"downloading file with size: {fsize} bytes in chunks")
 with self.filename.open(mode="wb") as f:
 file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query()
 else:
 with self.filename.open(mode="wb") as f:
 file.download(f).execute_query()
-logger.info(f"
+logger.info(f"file downloaded: {self.filename}")
 return


unstructured_ingest/connector/outlook.py
CHANGED
@@ -164,7 +164,7 @@ class OutlookIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
 self.connector_config._get_client()
 self.update_source_metadata()
 if not self.download_dir.is_dir():
-logger.debug(f"
+logger.debug(f"creating directory: {self.download_dir}")
 self.download_dir.mkdir(parents=True, exist_ok=True)

 with open(
@@ -182,7 +182,7 @@ class OutlookIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
 )
 logger.error(e)
 return
-logger.info(f"
+logger.info(f"file downloaded: {self.hash_mail_name(self.message_id)}")
 return


unstructured_ingest/connector/pinecone.py
CHANGED
@@ -80,7 +80,7 @@ class PineconeDestinationConnector(IngestDocSessionHandleMixin, BaseDestinationC
 )

 index = pc.Index(self.connector_config.index_name)
-logger.debug(f"
+logger.debug(f"connected to index: {pc.describe_index(self.connector_config.index_name)}")
 return index

 @DestinationConnectionError.wrap
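For context on the Pinecone hunk, a minimal sketch of obtaining an index handle with the `pinecone` client. The API key and index name are placeholders, and the `Pinecone(api_key=...)` constructor is assumed from the current pinecone client rather than shown in this diff; only `pc.Index(...)` and `pc.describe_index(...)` appear in the hunk above:

```python
# Sketch assuming the pinecone client; the API key and index name are placeholders.
from pinecone import Pinecone

pc = Pinecone(api_key="YOUR_API_KEY")  # assumed constructor, not part of this diff
index = pc.Index("my-index")           # same call pattern as the hunk above
print(f"connected to index: {pc.describe_index('my-index')}")
```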
unstructured_ingest/connector/sharepoint.py
CHANGED
@@ -253,11 +253,11 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):

 self.output_dir.mkdir(parents=True, exist_ok=True)
 if not self.download_dir.is_dir():
-logger.debug(f"
+logger.debug(f"creating directory: {self.download_dir}")
 self.download_dir.mkdir(parents=True, exist_ok=True)
 with self.filename.open(mode="w") as f:
 f.write(pld)
-logger.info(f"
+logger.info(f"file downloaded: {self.filename}")

 def _download_file(self):
 file = self._fetch_file()
@@ -266,17 +266,17 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
 self.output_dir.mkdir(parents=True, exist_ok=True)

 if not self.download_dir.is_dir():
-logger.debug(f"
+logger.debug(f"creating directory: {self.download_dir}")
 self.download_dir.mkdir(parents=True, exist_ok=True)

 if fsize > MAX_MB_SIZE:
-logger.info(f"
+logger.info(f"downloading file with size: {fsize} bytes in chunks")
 with self.filename.open(mode="wb") as f:
 file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query()
 else:
 with self.filename.open(mode="wb") as f:
 file.download(f).execute_query()
-logger.info(f"
+logger.info(f"file downloaded: {self.filename}")

 @BaseSingleIngestDoc.skip_if_file_exists
 @SourceConnectionError.wrap
@@ -374,7 +374,7 @@ class SharepointSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector
 if self.connector_config.process_pages:
 page_output = self._list_pages(site_client)
 if not page_output:
-logger.info(f"
+logger.info(f"couldn't process pages for site {site_client.base_url}")
 output = output + page_output
 return output

@@ -404,7 +404,7 @@ class SharepointSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector
 tenant_sites = {s.url for s in tenant_sites if (s.url is not None)}
 ingest_docs: t.List[SharepointIngestDoc] = []
 for site_url in tenant_sites:
-logger.info(f"
+logger.info(f"processing docs for site: {site_url}")
 site_client = self.connector_config.get_site_client(site_url)
 ingest_docs = ingest_docs + self._ingest_site_docs(site_client)
 return ingest_docs
@@ -440,7 +440,7 @@ class SharepointPermissionsConnector:
 if response.status_code == 200:
 return response.json()
 else:
-logger.info(f"
+logger.info(f"request failed with status code {response.status_code}:")
 logger.info(response.text)

 @requires_dependencies(["requests"], extras="sharepoint")
unstructured_ingest/connector/vectara.py
CHANGED
@@ -181,7 +181,7 @@ class VectaraDestinationConnector(BaseDestinationConnector):
 try:
 result = self._request(endpoint="index", data=body, http_method="POST")
 except Exception as e:
-logger.info(f"
+logger.info(f"exception {e} while indexing document {document['documentId']}")
 return

 if (
@@ -196,18 +196,18 @@ class VectaraDestinationConnector(BaseDestinationConnector):
 )
 )
 ):
-logger.info(f"
+logger.info(f"document {document['documentId']} already exists, re-indexing")
 self._delete_doc(document["documentId"])
 result = self._request(endpoint="index", data=body, http_method="POST")
 return

 if "status" in result and result["status"] and "OK" in result["status"]["code"]:
-logger.info(f"
+logger.info(f"indexing document {document['documentId']} succeeded")
 else:
-logger.info(f"
+logger.info(f"indexing document {document['documentId']} failed, response = {result}")

 def write_dict(self, *args, docs_list: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
-logger.info(f"
+logger.info(f"inserting / updating {len(docs_list)} documents to Vectara ")
 for vdoc in docs_list:
 self._index_document(vdoc)

@@ -216,7 +216,7 @@ class VectaraDestinationConnector(BaseDestinationConnector):

 def get_metadata(element) -> t.Dict[str, t.Any]:
 """
-Select which meta-data fields to include and
+Select which meta-data fields to include and optionally map them to a new new.
 remove the "metadata-" prefix from the keys
 """
 metadata_map = {
unstructured_ingest/interfaces.py
CHANGED
@@ -529,7 +529,7 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
 and self.filename.is_file()
 and self.filename.stat().st_size
 ):
-logger.debug(f"
+logger.debug(f"file exists: {self.filename}, skipping {func.__name__}")
 return None
 return func(self, *args, **kwargs)

@@ -586,7 +586,7 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):

 endpoint = partition_config.partition_endpoint

-logger.debug(f"
+logger.debug(f"using remote partition ({endpoint})")

 elements = partition_via_api(
 filename=str(self.filename),
@@ -606,7 +606,7 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
 self._date_processed = datetime.utcnow().isoformat()
 if self.read_config.download_only:
 return None
-logger.info(f"
+logger.info(f"processing {self.filename}")

 elements = self.partition_file(partition_config=partition_config, **partition_kwargs)
 element_dicts = [e.to_dict() for e in elements]
@@ -824,7 +824,7 @@ class IngestDocCleanupMixin:
 and self.filename.is_file()
 and not self.read_config.download_only
 ):
-logger.debug(f"
+logger.debug(f"cleaning up {self}")
 os.unlink(self.filename)


unstructured_ingest/logger.py
CHANGED
@@ -95,7 +95,7 @@ class SensitiveFormatter(logging.Formatter):


 def remove_root_handlers(logger: logging.Logger) -> None:
-# NOTE(robinson)
+# NOTE(robinson): in some environments such as Google Colab, there is a root handler
 # that doesn't not mask secrets, meaning sensitive info such as api keys appear in logs.
 # Removing these when they exist prevents this behavior
 if logger.root.hasHandlers():
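The note added above explains why root handlers are stripped: environments like Google Colab install a root handler that bypasses the secret-masking formatter. A standard-library sketch of removing pre-existing root handlers, independent of unstructured_ingest's own helper:

```python
# Standard-library sketch: drop pre-existing root handlers so only a handler we
# control (e.g. one with a secret-masking formatter) emits records.
# This is not the package's exact helper.
import logging


def remove_root_handlers(logger: logging.Logger) -> None:
    if logger.root.hasHandlers():
        for handler in list(logger.root.handlers):
            logger.root.removeHandler(handler)


logging.basicConfig()                      # simulate an environment-provided root handler
log = logging.getLogger("unstructured.ingest")
remove_root_handlers(log)
log.addHandler(logging.StreamHandler())    # re-attach a handler we control
log.warning("root handlers removed; this record goes through our handler only")
```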
unstructured_ingest/pipeline/copy.py
CHANGED
@@ -15,5 +15,5 @@ class Copier(CopyNode):
 ingest_doc = create_ingest_doc_from_dict(ingest_doc_dict)
 desired_output = ingest_doc._output_filename
 Path(desired_output).parent.mkdir(parents=True, exist_ok=True)
-logger.info(f"
+logger.info(f"copying {json_path} -> {desired_output}")
 shutil.copy(json_path, desired_output)
unstructured_ingest/pipeline/interfaces.py
CHANGED
@@ -57,7 +57,7 @@ class PipelineNode(DataClassJsonMixin, ABC):
 iterable = iterable if iterable else []
 if iterable:
 logger.info(
-f"
+f"calling {self.__class__.__name__} " f"with {len(iterable)} docs", # type: ignore
 )

 self.initialize()
@@ -92,7 +92,7 @@ class PipelineNode(DataClassJsonMixin, ABC):

 def initialize(self):
 if path := self.get_path():
-logger.info(f"
+logger.info(f"creating {path}")
 path.mkdir(parents=True, exist_ok=True)
 ingest_log_streaming_init(logging.DEBUG if self.pipeline_context.verbose else logging.INFO)

unstructured_ingest/pipeline/partition.py
CHANGED
@@ -30,7 +30,7 @@ class Partitioner(PartitionNode):
 and json_path.is_file()
 and json_path.stat().st_size
 ):
-logger.info(f"
+logger.info(f"file exists: {json_path}, skipping partition")
 return str(json_path)
 partition_kwargs: t.Dict[str, t.Any] = {
 "strategy": self.partition_config.strategy,
unstructured_ingest/pipeline/pipeline.py
CHANGED
@@ -96,7 +96,7 @@ class Pipeline(DataClassJsonMixin):
 for reformat_node in self.reformat_nodes:
 reformatted_jsons = reformat_node(iterable=partitioned_jsons)
 if not reformatted_jsons:
-logger.info(f"
+logger.info(f"no files to process after {reformat_node.__class__.__name__}")
 return
 partitioned_jsons = reformatted_jsons

unstructured_ingest/pipeline/reformat/chunking.py
CHANGED
@@ -58,7 +58,7 @@ class Chunker(ReformatNode):
 and json_path.is_file()
 and json_path.stat().st_size
 ):
-logger.debug(f"
+logger.debug(f"file exists: {json_path}, skipping chunking")
 return str(json_path)

 chunked_elements = self.chunk(elements_json)
@@ -112,7 +112,7 @@ class Chunker(ReformatNode):

 return partition_via_api(
 filename=elements_json_file,
-# -- (jennings) If api_key or api_url are None, partition_via_api will raise an
+# -- NOTE(jennings): If api_key or api_url are None, partition_via_api will raise an
 # -- error, which will be caught and logged by Chunker.run()
 api_key=self.partition_config.api_key, # type: ignore
 api_url=self.partition_config.partition_endpoint, # type: ignore
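The Chunker hunk above passes `api_key` and `api_url` through to `partition_via_api`. A hedged sketch of a hosted-API call using only the parameters visible in this diff; the import path is assumed from the `unstructured` package, and the file path, key, and endpoint URL are placeholders:

```python
# Sketch of calling the hosted partitioning API with the parameters seen in the diff.
# The import path is assumed from the `unstructured` package; all values are placeholders.
from unstructured.partition.api import partition_via_api

elements = partition_via_api(
    filename="example-docs/report.pdf",  # placeholder file path
    api_key="UNSTRUCTURED_API_KEY",      # placeholder; raises if None
    api_url="https://example.invalid/general/v0/general",  # placeholder endpoint
)
print(f"partitioned into {len(elements)} elements")
```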
unstructured_ingest/pipeline/reformat/embedding.py
CHANGED
@@ -44,7 +44,7 @@ class Embedder(ReformatNode):
 and json_path.is_file()
 and json_path.stat().st_size
 ):
-logger.debug(f"
+logger.debug(f"file exists: {json_path}, skipping embedding")
 return str(json_path)
 with open(elements_json) as f:
 elements = json.load(f)
unstructured_ingest/pipeline/source.py
CHANGED
@@ -24,12 +24,12 @@ class Reader(SourceNode):
 and doc.filename.is_file()
 and doc.filename.stat().st_size
 ):
-logger.info(f"
+logger.info(f"file exists: {doc.filename}, skipping download")
 # Still need to fetch metadata if file exists locally
 doc.update_source_metadata()
 else:
 serialized_doc = doc.to_json(redact_sensitive=True)
-logger.debug(f"
+logger.debug(f"fetching {serialized_doc} - PID: {os.getpid()}")
 if self.retry_strategy:
 self.retry_strategy(doc.get_file)
 else:
unstructured_ingest/utils/compression.py
CHANGED
@@ -22,7 +22,7 @@ TAR_FILE_EXT = [".tar", ".tar.gz", ".tgz"]

 def uncompress_file(filename: str, path: Optional[str] = None) -> str:
 """
-Takes in a compressed zip or tar file and
+Takes in a compressed zip or tar file and decompresses it
 """
 # Create path if it doesn't already exist
 if path:
@@ -65,7 +65,7 @@ def uncompress_tar_file(tar_filename: str, path: Optional[str] = None) -> str:
 logger.info(f"extracting tar {tar_filename} -> {path}")
 # NOTE: "r:*" mode opens both compressed (e.g ".tar.gz") and uncompressed ".tar" archives
 with tarfile.open(tar_filename, "r:*") as tfile:
-# NOTE(robinson: Mitigate against malicious content being extracted from the tar file.
+# NOTE(robinson): Mitigate against malicious content being extracted from the tar file.
 # This was added in Python 3.12
 # Ref: https://docs.python.org/3/library/tarfile.html#extraction-filters
 if sys.version_info >= (3, 12):
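The hunk above fixes a NOTE about the Python 3.12 tar extraction filter. A small standard-library sketch of the version-gated pattern it refers to; the archive and destination paths are placeholders:

```python
# Version-gated use of tarfile extraction filters (Python 3.12+), per the docs
# linked in the NOTE above. Archive and destination paths are placeholders.
import sys
import tarfile

tar_filename = "example.tar.gz"  # placeholder archive
path = "extracted"               # placeholder destination directory

with tarfile.open(tar_filename, "r:*") as tfile:
    if sys.version_info >= (3, 12):
        # The "data" filter blocks absolute paths, path traversal, and special files.
        tfile.extractall(path=path, filter="data")
    else:
        tfile.extractall(path=path)
```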
@@ -113,6 +113,6 @@ class CompressionSourceConnectorMixin:
 read_config=new_read_configs,
 processor_config=new_process_configs,
 )
-logger.info(f"
+logger.info(f"created local source connector: {local_connector.to_json()}")
 local_connector.initialize()
 return local_connector.get_ingest_docs()