unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (123)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +57 -13
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -23
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/connector.py +5 -7
  34. unstructured_ingest/v2/interfaces/downloader.py +8 -5
  35. unstructured_ingest/v2/interfaces/file_data.py +8 -2
  36. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  37. unstructured_ingest/v2/interfaces/processor.py +10 -10
  38. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  39. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  40. unstructured_ingest/v2/pipeline/pipeline.py +1 -5
  41. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  42. unstructured_ingest/v2/pipeline/steps/download.py +13 -11
  43. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  44. unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
  45. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  46. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  47. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  48. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  49. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  50. unstructured_ingest/v2/processes/__init__.py +18 -0
  51. unstructured_ingest/v2/processes/chunker.py +74 -28
  52. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  53. unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
  54. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +45 -35
  55. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
  56. unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
  57. unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
  58. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
  59. unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
  60. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
  61. unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
  62. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
  63. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
  64. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
  66. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
  67. unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
  68. unstructured_ingest/v2/processes/connectors/local.py +22 -14
  69. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  70. unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
  71. unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
  72. unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
  73. unstructured_ingest/v2/processes/connectors/pinecone.py +23 -20
  74. unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
  75. unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
  76. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  77. unstructured_ingest/v2/processes/connectors/sql.py +29 -31
  78. unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
  79. unstructured_ingest/v2/processes/embedder.py +106 -47
  80. unstructured_ingest/v2/processes/filter.py +11 -5
  81. unstructured_ingest/v2/processes/partitioner.py +79 -33
  82. unstructured_ingest/v2/processes/uncompress.py +3 -3
  83. unstructured_ingest/v2/utils.py +45 -0
  84. unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
  85. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +89 -116
  86. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
  87. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  88. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  89. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  90. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  91. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  92. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  93. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  94. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  95. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  96. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
  97. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  98. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  99. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  100. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  101. unstructured_ingest/v2/cli/cmds/local.py +0 -52
  102. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  103. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  104. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  105. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  106. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  107. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  108. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  109. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  110. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  111. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  112. unstructured_ingest/v2/cli/configs/__init__.py +0 -13
  113. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  114. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  115. unstructured_ingest/v2/cli/configs/filter.py +0 -28
  116. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  117. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  118. unstructured_ingest/v2/cli/interfaces.py +0 -27
  119. unstructured_ingest/v2/pipeline/utils.py +0 -15
  120. unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
  121. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  122. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
  123. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
@@ -11,19 +11,20 @@ from abc import ABC, abstractmethod
11
11
  from dataclasses import InitVar, dataclass, field
12
12
  from datetime import datetime
13
13
  from pathlib import Path
14
- from typing import Any, Optional, Type, TypeVar
14
+ from typing import TYPE_CHECKING, Any, Optional, Type, TypeVar
15
15
 
16
16
  from dataclasses_json import DataClassJsonMixin
17
17
  from dataclasses_json.core import Json, _decode_dataclass
18
- from unstructured.documents.elements import DataSourceMetadata
19
- from unstructured.embed.interfaces import BaseEmbeddingEncoder, Element
20
- from unstructured.partition.api import partition_via_api
21
- from unstructured.staging.base import elements_to_dicts, flatten_dict
22
18
 
23
19
  from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
24
20
  from unstructured_ingest.enhanced_dataclass.core import _asdict
25
21
  from unstructured_ingest.error import PartitionError, SourceConnectionError
26
22
  from unstructured_ingest.logger import logger
23
+ from unstructured_ingest.utils.data_prep import flatten_dict
24
+
25
+ if TYPE_CHECKING:
26
+ from unstructured.documents.elements import Element
27
+ from unstructured.embed.interfaces import BaseEmbeddingEncoder
27
28
 
28
29
  A = TypeVar("A", bound="DataClassJsonMixin")
29
30
 
@@ -195,7 +196,7 @@ class EmbeddingConfig(BaseConfig):
195
196
  aws_secret_access_key: Optional[str] = None
196
197
  aws_region: Optional[str] = None
197
198
 
198
- def get_embedder(self) -> BaseEmbeddingEncoder:
199
+ def get_embedder(self) -> "BaseEmbeddingEncoder":
199
200
  kwargs: dict[str, Any] = {}
200
201
  if self.api_key:
201
202
  kwargs["api_key"] = self.api_key
@@ -551,7 +552,8 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
551
552
  self,
552
553
  partition_config: PartitionConfig,
553
554
  **partition_kwargs,
554
- ) -> list[Element]:
555
+ ) -> list["Element"]:
556
+ from unstructured.documents.elements import DataSourceMetadata
555
557
  from unstructured.partition.auto import partition
556
558
 
557
559
  if not partition_config.partition_by_api:
@@ -570,6 +572,8 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
570
572
  **partition_kwargs,
571
573
  )
572
574
  else:
575
+ from unstructured.partition.api import partition_via_api
576
+
573
577
  endpoint = partition_config.partition_endpoint
574
578
 
575
579
  logger.debug(f"Using remote partition ({endpoint})")
@@ -595,7 +599,7 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
595
599
  logger.info(f"Processing {self.filename}")
596
600
 
597
601
  elements = self.partition_file(partition_config=partition_config, **partition_kwargs)
598
- element_dicts = elements_to_dicts(elements)
602
+ element_dicts = [e.to_dict() for e in elements]
599
603
 
600
604
  self.isd_elems_no_filename: list[dict[str, Any]] = []
601
605
  for elem in element_dicts:
@@ -736,7 +740,7 @@ class BaseDestinationConnector(BaseConnector, ABC):
736
740
  elements_dict_normalized = [self.normalize_dict(element_dict=d) for d in elements_dict]
737
741
  return self.write_dict(*args, elements_dict=elements_dict_normalized, **kwargs)
738
742
 
739
- def write_elements(self, elements: list[Element], *args, **kwargs) -> None:
743
+ def write_elements(self, elements: list["Element"], *args, **kwargs) -> None:
740
744
  elements_dict = [e.to_dict() for e in elements]
741
745
  self.modify_and_write_dict(*args, elements_dict=elements_dict, **kwargs)
742
746
 
@@ -8,11 +8,9 @@ from dataclasses import dataclass, field
8
8
  from multiprocessing.managers import DictProxy
9
9
  from pathlib import Path
10
10
 
11
- import backoff
12
11
  from dataclasses_json import DataClassJsonMixin
13
12
 
14
13
  from unstructured_ingest.error import SourceConnectionNetworkError
15
- from unstructured_ingest.ingest_backoff import RetryHandler
16
14
  from unstructured_ingest.interfaces import (
17
15
  BaseDestinationConnector,
18
16
  BaseSourceConnector,
@@ -23,6 +21,9 @@ from unstructured_ingest.interfaces import (
23
21
  )
24
22
  from unstructured_ingest.logger import ingest_log_streaming_init, logger
25
23
 
24
+ if t.TYPE_CHECKING:
25
+ from unstructured_ingest.ingest_backoff import RetryHandler
26
+
26
27
 
27
28
  @dataclass
28
29
  class PipelineContext(ProcessorConfig):
@@ -147,8 +148,12 @@ class SourceNode(PipelineNode):
147
148
  retry_strategy_config: t.Optional[RetryStrategyConfig] = None
148
149
 
149
150
  @property
150
- def retry_strategy(self) -> t.Optional[RetryHandler]:
151
+ def retry_strategy(self) -> t.Optional["RetryHandler"]:
151
152
  if retry_strategy_config := self.retry_strategy_config:
153
+ import backoff
154
+
155
+ from unstructured_ingest.ingest_backoff import RetryHandler
156
+
152
157
  return RetryHandler(
153
158
  backoff.expo,
154
159
  SourceConnectionNetworkError,
@@ -5,16 +5,15 @@ import json
5
5
  import os.path
6
6
  from dataclasses import dataclass
7
7
  from pathlib import Path
8
- from typing import Optional
9
-
10
- from unstructured.chunking import dispatch
11
- from unstructured.documents.elements import Element, assign_and_map_hash_ids
12
- from unstructured.partition.api import partition_via_api
13
- from unstructured.staging.base import elements_from_json, elements_to_dicts
8
+ from typing import TYPE_CHECKING, Optional
14
9
 
15
10
  from unstructured_ingest.interfaces import ChunkingConfig, PartitionConfig
16
11
  from unstructured_ingest.logger import logger
17
12
  from unstructured_ingest.pipeline.interfaces import ReformatNode
13
+ from unstructured_ingest.utils.chunking import assign_and_map_hash_ids
14
+
15
+ if TYPE_CHECKING:
16
+ from unstructured.documents.elements import Element
18
17
 
19
18
 
20
19
  @dataclass
@@ -69,9 +68,9 @@ class Chunker(ReformatNode):
69
68
  logger.info(f"chunking_strategy is None, skipping chunking for {filename_ext}")
70
69
  return
71
70
 
72
- assign_and_map_hash_ids(chunked_elements)
71
+ element_dicts = [e.to_dict() for e in chunked_elements]
72
+ assign_and_map_hash_ids(elements=element_dicts)
73
73
 
74
- element_dicts = elements_to_dicts(chunked_elements)
75
74
  with open(json_path, "w", encoding="utf8") as output_f:
76
75
  logger.info(f"writing chunking content to {json_path}")
77
76
  json.dump(element_dicts, output_f, ensure_ascii=False, indent=2)
@@ -86,13 +85,16 @@ class Chunker(ReformatNode):
86
85
  def get_path(self) -> Path:
87
86
  return (Path(self.pipeline_context.work_dir) / "chunked").resolve()
88
87
 
89
- def chunk(self, elements_json_file: str) -> Optional[list[Element]]:
88
+ def chunk(self, elements_json_file: str) -> Optional[list["Element"]]:
90
89
  """Called by Chunker.run() to properly execute the defined chunking_strategy."""
91
90
  # -- No chunking_strategy means no chunking --
92
91
  if self.chunking_config.chunking_strategy is None:
93
92
  return
94
93
  # -- Chunk locally for open-source chunking strategies, even when partitioning remotely --
95
94
  if self.chunking_config.chunking_strategy in ("basic", "by_title"):
95
+ from unstructured.chunking import dispatch
96
+ from unstructured.staging.base import elements_from_json
97
+
96
98
  return dispatch.chunk(
97
99
  elements=elements_from_json(filename=elements_json_file),
98
100
  chunking_strategy=self.chunking_config.chunking_strategy,
@@ -106,6 +108,8 @@ class Chunker(ReformatNode):
106
108
  )
107
109
  # -- Chunk remotely --
108
110
  if self.partition_config.partition_by_api:
111
+ from unstructured.partition.api import partition_via_api
112
+
109
113
  return partition_via_api(
110
114
  filename=elements_json_file,
111
115
  # -- (jennings) If api_key or api_url are None, partition_via_api will raise an
@@ -5,8 +5,6 @@ from dataclasses import dataclass
5
5
  from pathlib import Path
6
6
  from typing import Optional
7
7
 
8
- from unstructured.staging.base import elements_from_json, elements_to_dicts
9
-
10
8
  from unstructured_ingest.interfaces import (
11
9
  EmbeddingConfig,
12
10
  )
@@ -29,6 +27,8 @@ class Embedder(ReformatNode):
29
27
  return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]
30
28
 
31
29
  def run(self, elements_json: str) -> Optional[str]:
30
+ from unstructured.staging.base import elements_from_json
31
+
32
32
  try:
33
33
  elements_json_filename = os.path.basename(elements_json)
34
34
  filename_ext = os.path.basename(elements_json_filename)
@@ -51,7 +51,7 @@ class Embedder(ReformatNode):
51
51
  elements = elements_from_json(filename=elements_json)
52
52
  embedder = self.embedder_config.get_embedder()
53
53
  embedded_elements = embedder.embed_documents(elements=elements)
54
- element_dicts = elements_to_dicts(embedded_elements)
54
+ element_dicts = [e.to_dict() for e in embedded_elements]
55
55
  with open(json_path, "w", encoding="utf8") as output_f:
56
56
  logger.info(f"writing embeddings content to {json_path}")
57
57
  json.dump(element_dicts, output_f, ensure_ascii=False, indent=2)
@@ -2,7 +2,7 @@ import typing as t
2
2
  from typing import Type
3
3
 
4
4
  from .airtable import AirtableRunner
5
- from .astra import AstraRunner
5
+ from .astradb import AstraDBRunner
6
6
  from .base_runner import Runner
7
7
  from .biomed import BiomedRunner
8
8
  from .confluence import ConfluenceRunner
@@ -36,7 +36,7 @@ from .wikipedia import WikipediaRunner
36
36
 
37
37
  runner_map: t.Dict[str, Type[Runner]] = {
38
38
  "airtable": AirtableRunner,
39
- "astra": AstraRunner,
39
+ "astradb": AstraDBRunner,
40
40
  "azure": AzureRunner,
41
41
  "biomed": BiomedRunner,
42
42
  "box": BoxRunner,
@@ -8,27 +8,27 @@ from unstructured_ingest.runner.base_runner import Runner
8
8
  from unstructured_ingest.runner.utils import update_download_dir_hash
9
9
 
10
10
  if t.TYPE_CHECKING:
11
- from unstructured_ingest.connector.astra import SimpleAstraConfig
11
+ from unstructured_ingest.connector.astradb import SimpleAstraDBConfig
12
12
 
13
13
 
14
14
  @dataclass
15
- class AstraRunner(Runner):
16
- connector_config: "SimpleAstraConfig"
15
+ class AstraDBRunner(Runner):
16
+ connector_config: "SimpleAstraDBConfig"
17
17
 
18
18
  def update_read_config(self):
19
19
  hashed_dir_name = hashlib.sha256(
20
20
  str(self.connector_config.access_config.api_endpoint).encode("utf-8"),
21
21
  )
22
22
  self.read_config.download_dir = update_download_dir_hash(
23
- connector_name="astra",
23
+ connector_name="astradb",
24
24
  read_config=self.read_config,
25
25
  hashed_dir_name=hashed_dir_name,
26
26
  logger=logger,
27
27
  )
28
28
 
29
29
  def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
30
- from unstructured_ingest.connector.astra import (
31
- AstraSourceConnector,
30
+ from unstructured_ingest.connector.astradb import (
31
+ AstraDBSourceConnector,
32
32
  )
33
33
 
34
- return AstraSourceConnector
34
+ return AstraDBSourceConnector
@@ -1,6 +1,6 @@
1
1
  import typing as t
2
2
 
3
- from .astra import AstraWriter
3
+ from .astradb import AstraDBWriter
4
4
  from .azure_cognitive_search import AzureCognitiveSearchWriter
5
5
  from .base_writer import Writer
6
6
  from .chroma import ChromaWriter
@@ -23,7 +23,7 @@ from .vectara import VectaraWriter
23
23
  from .weaviate import WeaviateWriter
24
24
 
25
25
  writer_map: t.Dict[str, t.Type[Writer]] = {
26
- "astra": AstraWriter,
26
+ "astradb": AstraDBWriter,
27
27
  "azure": AzureWriter,
28
28
  "azure_cognitive_search": AzureCognitiveSearchWriter,
29
29
  "box": BoxWriter,
@@ -6,17 +6,17 @@ from unstructured_ingest.interfaces import BaseDestinationConnector
6
6
  from unstructured_ingest.runner.writers.base_writer import Writer
7
7
 
8
8
  if t.TYPE_CHECKING:
9
- from unstructured_ingest.connector.astra import AstraWriteConfig, SimpleAstraConfig
9
+ from unstructured_ingest.connector.astradb import AstraDBWriteConfig, SimpleAstraDBConfig
10
10
 
11
11
 
12
12
  @dataclass
13
- class AstraWriter(Writer, EnhancedDataClassJsonMixin):
14
- write_config: "AstraWriteConfig"
15
- connector_config: "SimpleAstraConfig"
13
+ class AstraDBWriter(Writer, EnhancedDataClassJsonMixin):
14
+ write_config: "AstraDBWriteConfig"
15
+ connector_config: "SimpleAstraDBConfig"
16
16
 
17
17
  def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
18
- from unstructured_ingest.connector.astra import (
19
- AstraDestinationConnector,
18
+ from unstructured_ingest.connector.astradb import (
19
+ AstraDBDestinationConnector,
20
20
  )
21
21
 
22
- return AstraDestinationConnector
22
+ return AstraDBDestinationConnector
@@ -0,0 +1,45 @@
1
+ import hashlib
2
+ from itertools import groupby
3
+
4
+
5
+ def id_to_hash(element: dict, sequence_number: int) -> str:
6
+ """Calculates and assigns a deterministic hash as an ID.
7
+
8
+ The hash ID is based on element's text, sequence number on page,
9
+ page number and its filename.
10
+
11
+ Args:
12
+ sequence_number: index on page
13
+
14
+ Returns: new ID value
15
+ """
16
+ filename = element["metadata"].get("filename")
17
+ text = element["text"]
18
+ page_number = element["metadata"].get("page_number")
19
+ data = f"{filename}{text}{page_number}{sequence_number}"
20
+ element["element_id"] = hashlib.sha256(data.encode()).hexdigest()[:32]
21
+ return element["element_id"]
22
+
23
+
24
+ def assign_and_map_hash_ids(elements: list[dict]) -> list[dict]:
25
+ # -- generate sequence number for each element on a page --
26
+ elements = elements.copy()
27
+ page_numbers = [e["metadata"].get("page_number") for e in elements]
28
+ page_seq_pairs = [
29
+ seq_on_page for page, group in groupby(page_numbers) for seq_on_page, _ in enumerate(group)
30
+ ]
31
+
32
+ # -- assign hash IDs to elements --
33
+ old_to_new_mapping = {
34
+ element["element_id"]: id_to_hash(element=element, sequence_number=seq_on_page_counter)
35
+ for element, seq_on_page_counter in zip(elements, page_seq_pairs)
36
+ }
37
+
38
+ # -- map old parent IDs to new ones --
39
+ for e in elements:
40
+ parent_id = e["metadata"].get("parent_id")
41
+ if not parent_id:
42
+ continue
43
+ e["metadata"]["parent_id"] = old_to_new_mapping[parent_id]
44
+
45
+ return elements
@@ -33,7 +33,7 @@ def requires_dependencies(
33
33
  raise ImportError(
34
34
  f"Following dependencies are missing: {', '.join(missing_deps)}. "
35
35
  + (
36
- f"""Please install them using `pip install "unstructured[{extras}]"`."""
36
+ f"""Please install them using `pip install "unstructured-ingest[{extras}]"`.""" # noqa: E501
37
37
  if extras
38
38
  else f"Please install them using `pip install {' '.join(missing_deps)}`."
39
39
  ),
@@ -0,0 +1,9 @@
1
+ GOOGLE_DRIVE_EXPORT_TYPES = {
2
+ "application/vnd.google-apps.document": "application/"
3
+ "vnd.openxmlformats-officedocument.wordprocessingml.document",
4
+ "application/vnd.google-apps.spreadsheet": "application/"
5
+ "vnd.openxmlformats-officedocument.spreadsheetml.sheet",
6
+ "application/vnd.google-apps.presentation": "application/"
7
+ "vnd.openxmlformats-officedocument.presentationml.presentation",
8
+ "application/vnd.google-apps.photo": "image/jpeg",
9
+ }
@@ -1,13 +1,15 @@
1
1
  import inspect
2
2
  from abc import ABC, abstractmethod
3
+ from collections import Counter
3
4
  from dataclasses import dataclass, field, fields
4
5
  from typing import Any, Optional, Type, TypeVar
5
6
 
6
7
  import click
8
+ from pydantic import BaseModel
7
9
 
8
10
  from unstructured_ingest.v2.cli.base.importer import import_from_string
9
- from unstructured_ingest.v2.cli.interfaces import CliConfig
10
- from unstructured_ingest.v2.cli.utils import extract_config
11
+ from unstructured_ingest.v2.cli.utils.click import extract_config
12
+ from unstructured_ingest.v2.cli.utils.model_conversion import options_from_base_model, post_check
11
13
  from unstructured_ingest.v2.interfaces import ProcessorConfig
12
14
  from unstructured_ingest.v2.logger import logger
13
15
  from unstructured_ingest.v2.pipeline.pipeline import Pipeline
@@ -15,6 +17,7 @@ from unstructured_ingest.v2.processes.chunker import Chunker, ChunkerConfig
15
17
  from unstructured_ingest.v2.processes.connector_registry import (
16
18
  DownloaderT,
17
19
  IndexerT,
20
+ RegistryEntry,
18
21
  UploaderT,
19
22
  UploadStager,
20
23
  UploadStagerConfig,
@@ -33,7 +36,52 @@ CommandT = TypeVar("CommandT", bound=click.Command)
33
36
  @dataclass
34
37
  class BaseCmd(ABC):
35
38
  cmd_name: str
36
- default_configs: list[Type[CliConfig]] = field(default_factory=list)
39
+ registry_entry: RegistryEntry
40
+ default_configs: list[Type[BaseModel]] = field(default_factory=list)
41
+
42
+ @abstractmethod
43
+ def get_registry_options(self):
44
+ pass
45
+
46
+ def get_default_options(self) -> list[click.Option]:
47
+ options = []
48
+ for extra in self.default_configs:
49
+ options.extend(options_from_base_model(model=extra))
50
+ return options
51
+
52
+ @classmethod
53
+ def consolidate_options(cls, options: list[click.Option]) -> list[click.Option]:
54
+ option_names = [option.name for option in options]
55
+ duplicate_names = [name for name, count in Counter(option_names).items() if count > 1]
56
+ if not duplicate_names:
57
+ return options
58
+ consolidated_options = []
59
+ current_names = []
60
+ for option in options:
61
+ if option.name not in current_names:
62
+ current_names.append(option.name)
63
+ consolidated_options.append(option)
64
+ continue
65
+ existing_option = next(o for o in consolidated_options if o.name == option.name)
66
+ if existing_option.__dict__ == option.__dict__:
67
+ continue
68
+ option_diff = cls.get_options_diff(o1=option, o2=existing_option)
69
+ raise ValueError(
70
+ "Conflicting duplicate {} option defined: {}".format(
71
+ option.name, " | ".join([f"{d[0]}: {d[1]}" for d in option_diff])
72
+ )
73
+ )
74
+ return consolidated_options
75
+
76
+ @staticmethod
77
+ def get_options_diff(o1: click.Option, o2: click.Option):
78
+ o1_dict = o1.__dict__
79
+ o2_dict = o2.__dict__
80
+ for d in [o1_dict, o2_dict]:
81
+ d["opts"] = ",".join(d["opts"])
82
+ d["secondary_opts"] = ",".join(d["secondary_opts"])
83
+ option_diff = set(o1_dict.items()) ^ set(o2_dict.items())
84
+ return option_diff
37
85
 
38
86
  @property
39
87
  def cmd_name_key(self):
@@ -47,15 +95,11 @@ class BaseCmd(ABC):
47
95
  def cmd(self, ctx: click.Context, **options) -> None:
48
96
  pass
49
97
 
50
- def add_options(self, cmd: CommandT, extras: list[Type[CliConfig]]) -> CommandT:
51
- configs = self.default_configs
52
- # make sure what's unique to this cmd appears first
53
- extras.extend(configs)
54
- for config in extras:
55
- try:
56
- config.add_cli_options(cmd=cmd)
57
- except ValueError as e:
58
- raise ValueError(f"failed to set configs from {config.__name__}: {e}")
98
+ def add_options(self, cmd: CommandT) -> CommandT:
99
+ options = self.get_registry_options()
100
+ options.extend(self.get_default_options())
101
+ post_check(options)
102
+ cmd.params.extend(options)
59
103
  return cmd
60
104
 
61
105
  def get_pipline(
@@ -111,7 +155,7 @@ class BaseCmd(ABC):
111
155
  @staticmethod
112
156
  def get_filterer(options: dict[str, Any]) -> Optional[Filterer]:
113
157
  filterer_configs = extract_config(flat_data=options, config=FiltererConfig)
114
- if not filterer_configs.to_dict():
158
+ if not filterer_configs.dict():
115
159
  return None
116
160
  return Filterer(config=filterer_configs)
117
161
 
@@ -1,20 +1,34 @@
1
1
  import logging
2
2
  from dataclasses import dataclass
3
- from typing import Optional, Type
4
3
 
5
4
  import click
6
5
 
7
6
  from unstructured_ingest.v2.cli.base.cmd import BaseCmd
8
- from unstructured_ingest.v2.cli.interfaces import CliConfig
9
- from unstructured_ingest.v2.cli.utils import Dict, conform_click_options
7
+ from unstructured_ingest.v2.cli.utils.click import Dict, conform_click_options
8
+ from unstructured_ingest.v2.cli.utils.model_conversion import options_from_base_model
10
9
  from unstructured_ingest.v2.logger import logger
10
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
11
11
 
12
12
 
13
13
  @dataclass
14
14
  class DestCmd(BaseCmd):
15
- connection_config: Optional[Type[CliConfig]] = None
16
- uploader_config: Optional[Type[CliConfig]] = None
17
- upload_stager_config: Optional[Type[CliConfig]] = None
15
+ registry_entry: DestinationRegistryEntry
16
+
17
+ def get_registry_options(self):
18
+ options = []
19
+ configs = [
20
+ config
21
+ for config in [
22
+ self.registry_entry.uploader_config,
23
+ self.registry_entry.upload_stager_config,
24
+ self.registry_entry.connection_config,
25
+ ]
26
+ if config
27
+ ]
28
+ for config in configs:
29
+ options.extend(options_from_base_model(model=config))
30
+ options = self.consolidate_options(options=options)
31
+ return options
18
32
 
19
33
  def cmd(self, ctx: click.Context, **options) -> None:
20
34
  logger.setLevel(logging.DEBUG if options.get("verbose", False) else logging.INFO)
@@ -47,12 +61,7 @@ class DestCmd(BaseCmd):
47
61
  cmd.name = self.cli_cmd_name
48
62
  cmd.short_help = "v2"
49
63
  cmd.invoke_without_command = True
50
- extras = [
51
- x
52
- for x in [self.uploader_config, self.upload_stager_config, self.connection_config]
53
- if x
54
- ]
55
- self.add_options(cmd, extras=extras)
64
+ self.add_options(cmd)
56
65
  cmd.params.append(
57
66
  click.Option(
58
67
  ["--custom-stager"],
@@ -1,37 +1,52 @@
1
1
  import logging
2
2
  from dataclasses import dataclass, field
3
- from typing import Optional, Type
4
3
 
5
4
  import click
5
+ from pydantic import BaseModel
6
6
 
7
7
  from unstructured_ingest.v2.cli.base.cmd import BaseCmd
8
- from unstructured_ingest.v2.cli.configs import (
9
- ChunkerCliConfig,
10
- EmbedderCliConfig,
11
- FilterCliConfig,
12
- PartitionerCliConfig,
13
- ProcessorCliConfig,
14
- )
15
- from unstructured_ingest.v2.cli.interfaces import CliConfig
16
- from unstructured_ingest.v2.cli.utils import Group, conform_click_options
8
+ from unstructured_ingest.v2.cli.utils.click import Group, conform_click_options
9
+ from unstructured_ingest.v2.cli.utils.model_conversion import options_from_base_model
10
+ from unstructured_ingest.v2.interfaces import ProcessorConfig
17
11
  from unstructured_ingest.v2.logger import logger
12
+ from unstructured_ingest.v2.processes import (
13
+ ChunkerConfig,
14
+ EmbedderConfig,
15
+ FiltererConfig,
16
+ PartitionerConfig,
17
+ )
18
+ from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
18
19
 
19
20
 
20
21
  @dataclass
21
22
  class SrcCmd(BaseCmd):
22
- indexer_config: Optional[Type[CliConfig]] = None
23
- downloader_config: Optional[Type[CliConfig]] = None
24
- connection_config: Optional[Type[CliConfig]] = None
25
- default_configs: list[CliConfig] = field(
23
+ registry_entry: SourceRegistryEntry
24
+ default_configs: list[BaseModel] = field(
26
25
  default_factory=lambda: [
27
- ProcessorCliConfig,
28
- PartitionerCliConfig,
29
- EmbedderCliConfig,
30
- FilterCliConfig,
31
- ChunkerCliConfig,
26
+ ProcessorConfig,
27
+ PartitionerConfig,
28
+ EmbedderConfig,
29
+ FiltererConfig,
30
+ ChunkerConfig,
32
31
  ]
33
32
  )
34
33
 
34
+ def get_registry_options(self):
35
+ options = []
36
+ configs = [
37
+ config
38
+ for config in [
39
+ self.registry_entry.connection_config,
40
+ self.registry_entry.indexer_config,
41
+ self.registry_entry.downloader_config,
42
+ ]
43
+ if config
44
+ ]
45
+ for config in configs:
46
+ options.extend(options_from_base_model(model=config))
47
+ options = self.consolidate_options(options=options)
48
+ return options
49
+
35
50
  def cmd(self, ctx: click.Context, **options) -> None:
36
51
  if ctx.invoked_subcommand:
37
52
  return
@@ -55,10 +70,7 @@ class SrcCmd(BaseCmd):
55
70
  cmd.name = self.cli_cmd_name
56
71
  cmd.short_help = "v2"
57
72
  cmd.invoke_without_command = True
58
- extras = [
59
- x for x in [self.indexer_config, self.downloader_config, self.connection_config] if x
60
- ]
61
- self.add_options(cmd, extras=extras)
73
+ self.add_options(cmd)
62
74
 
63
75
  # TODO remove after v1 no longer supported
64
76
  cmd.params.append(
@@ -0,0 +1,14 @@
1
+ import click
2
+
3
+ from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
4
+ from unstructured_ingest.v2.processes.connector_registry import (
5
+ destination_registry,
6
+ source_registry,
7
+ )
8
+
9
+ src_cmds = [SrcCmd(cmd_name=k, registry_entry=v) for k, v in source_registry.items()]
10
+ dest_cmds = [DestCmd(cmd_name=k, registry_entry=v) for k, v in destination_registry.items()]
11
+
12
+ src: list[click.Group] = [v.get_cmd() for v in src_cmds]
13
+
14
+ dest: list[click.Command] = [v.get_cmd() for v in dest_cmds]