unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cli.py +6 -1
- unstructured_ingest/cli/cmds/__init__.py +4 -4
- unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
- unstructured_ingest/cli/interfaces.py +13 -6
- unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
- unstructured_ingest/connector/biomed.py +12 -5
- unstructured_ingest/connector/confluence.py +3 -3
- unstructured_ingest/connector/github.py +3 -2
- unstructured_ingest/connector/google_drive.py +1 -2
- unstructured_ingest/connector/mongodb.py +1 -2
- unstructured_ingest/connector/notion/client.py +31 -16
- unstructured_ingest/connector/notion/connector.py +3 -2
- unstructured_ingest/connector/registry.py +2 -2
- unstructured_ingest/connector/vectara.py +7 -2
- unstructured_ingest/interfaces.py +13 -9
- unstructured_ingest/pipeline/interfaces.py +8 -3
- unstructured_ingest/pipeline/reformat/chunking.py +13 -9
- unstructured_ingest/pipeline/reformat/embedding.py +3 -3
- unstructured_ingest/runner/__init__.py +2 -2
- unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
- unstructured_ingest/utils/chunking.py +45 -0
- unstructured_ingest/utils/dep_check.py +1 -1
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/v2/cli/base/cmd.py +66 -12
- unstructured_ingest/v2/cli/base/dest.py +21 -12
- unstructured_ingest/v2/cli/base/src.py +35 -21
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
- unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/connector.py +5 -7
- unstructured_ingest/v2/interfaces/downloader.py +17 -8
- unstructured_ingest/v2/interfaces/file_data.py +13 -2
- unstructured_ingest/v2/interfaces/indexer.py +3 -4
- unstructured_ingest/v2/interfaces/process.py +3 -4
- unstructured_ingest/v2/interfaces/processor.py +10 -10
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/interfaces/uploader.py +3 -3
- unstructured_ingest/v2/pipeline/interfaces.py +3 -5
- unstructured_ingest/v2/pipeline/pipeline.py +73 -7
- unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
- unstructured_ingest/v2/pipeline/steps/download.py +90 -24
- unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +14 -10
- unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
- unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +74 -28
- unstructured_ingest/v2/processes/connector_registry.py +8 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
- unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +53 -35
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +38 -27
- unstructured_ingest/v2/processes/connectors/chroma.py +38 -27
- unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +95 -31
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +92 -53
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +47 -16
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -13
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +18 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +49 -61
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +46 -13
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +50 -20
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +27 -28
- unstructured_ingest/v2/processes/connectors/google_drive.py +52 -42
- unstructured_ingest/v2/processes/connectors/local.py +36 -28
- unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
- unstructured_ingest/v2/processes/connectors/mongodb.py +32 -22
- unstructured_ingest/v2/processes/connectors/onedrive.py +31 -16
- unstructured_ingest/v2/processes/connectors/opensearch.py +81 -43
- unstructured_ingest/v2/processes/connectors/pinecone.py +29 -23
- unstructured_ingest/v2/processes/connectors/salesforce.py +36 -26
- unstructured_ingest/v2/processes/connectors/sharepoint.py +64 -33
- unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
- unstructured_ingest/v2/processes/connectors/sql.py +52 -39
- unstructured_ingest/v2/processes/connectors/weaviate.py +35 -18
- unstructured_ingest/v2/processes/embedder.py +106 -47
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +79 -33
- unstructured_ingest/v2/processes/uncompress.py +3 -3
- unstructured_ingest/v2/utils.py +45 -0
- unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +92 -116
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
- unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
- unstructured_ingest/v2/cli/cmds/astra.py +0 -85
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
- unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -77
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
- unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
- unstructured_ingest/v2/cli/cmds/local.py +0 -60
- unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
- unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
- unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
- unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
- unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
- unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
- unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
- unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
- unstructured_ingest/v2/cli/cmds/sql.py +0 -84
- unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
- unstructured_ingest/v2/cli/configs/__init__.py +0 -6
- unstructured_ingest/v2/cli/configs/chunk.py +0 -89
- unstructured_ingest/v2/cli/configs/embed.py +0 -74
- unstructured_ingest/v2/cli/configs/partition.py +0 -99
- unstructured_ingest/v2/cli/configs/processor.py +0 -88
- unstructured_ingest/v2/cli/interfaces.py +0 -27
- unstructured_ingest/v2/pipeline/utils.py +0 -15
- unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
- /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
|
@@ -11,19 +11,20 @@ from abc import ABC, abstractmethod
|
|
|
11
11
|
from dataclasses import InitVar, dataclass, field
|
|
12
12
|
from datetime import datetime
|
|
13
13
|
from pathlib import Path
|
|
14
|
-
from typing import Any, Optional, Type, TypeVar
|
|
14
|
+
from typing import TYPE_CHECKING, Any, Optional, Type, TypeVar
|
|
15
15
|
|
|
16
16
|
from dataclasses_json import DataClassJsonMixin
|
|
17
17
|
from dataclasses_json.core import Json, _decode_dataclass
|
|
18
|
-
from unstructured.documents.elements import DataSourceMetadata
|
|
19
|
-
from unstructured.embed.interfaces import BaseEmbeddingEncoder, Element
|
|
20
|
-
from unstructured.partition.api import partition_via_api
|
|
21
|
-
from unstructured.staging.base import elements_to_dicts, flatten_dict
|
|
22
18
|
|
|
23
19
|
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
|
|
24
20
|
from unstructured_ingest.enhanced_dataclass.core import _asdict
|
|
25
21
|
from unstructured_ingest.error import PartitionError, SourceConnectionError
|
|
26
22
|
from unstructured_ingest.logger import logger
|
|
23
|
+
from unstructured_ingest.utils.data_prep import flatten_dict
|
|
24
|
+
|
|
25
|
+
if TYPE_CHECKING:
|
|
26
|
+
from unstructured.documents.elements import Element
|
|
27
|
+
from unstructured.embed.interfaces import BaseEmbeddingEncoder
|
|
27
28
|
|
|
28
29
|
A = TypeVar("A", bound="DataClassJsonMixin")
|
|
29
30
|
|
|
@@ -195,7 +196,7 @@ class EmbeddingConfig(BaseConfig):
|
|
|
195
196
|
aws_secret_access_key: Optional[str] = None
|
|
196
197
|
aws_region: Optional[str] = None
|
|
197
198
|
|
|
198
|
-
def get_embedder(self) -> BaseEmbeddingEncoder:
|
|
199
|
+
def get_embedder(self) -> "BaseEmbeddingEncoder":
|
|
199
200
|
kwargs: dict[str, Any] = {}
|
|
200
201
|
if self.api_key:
|
|
201
202
|
kwargs["api_key"] = self.api_key
|
|
@@ -551,7 +552,8 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
|
|
|
551
552
|
self,
|
|
552
553
|
partition_config: PartitionConfig,
|
|
553
554
|
**partition_kwargs,
|
|
554
|
-
) -> list[Element]:
|
|
555
|
+
) -> list["Element"]:
|
|
556
|
+
from unstructured.documents.elements import DataSourceMetadata
|
|
555
557
|
from unstructured.partition.auto import partition
|
|
556
558
|
|
|
557
559
|
if not partition_config.partition_by_api:
|
|
@@ -570,6 +572,8 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
|
|
|
570
572
|
**partition_kwargs,
|
|
571
573
|
)
|
|
572
574
|
else:
|
|
575
|
+
from unstructured.partition.api import partition_via_api
|
|
576
|
+
|
|
573
577
|
endpoint = partition_config.partition_endpoint
|
|
574
578
|
|
|
575
579
|
logger.debug(f"Using remote partition ({endpoint})")
|
|
@@ -595,7 +599,7 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
|
|
|
595
599
|
logger.info(f"Processing {self.filename}")
|
|
596
600
|
|
|
597
601
|
elements = self.partition_file(partition_config=partition_config, **partition_kwargs)
|
|
598
|
-
element_dicts = elements_to_dicts(elements)
|
|
602
|
+
element_dicts = [e.to_dict() for e in elements]
|
|
599
603
|
|
|
600
604
|
self.isd_elems_no_filename: list[dict[str, Any]] = []
|
|
601
605
|
for elem in element_dicts:
|
|
@@ -736,7 +740,7 @@ class BaseDestinationConnector(BaseConnector, ABC):
|
|
|
736
740
|
elements_dict_normalized = [self.normalize_dict(element_dict=d) for d in elements_dict]
|
|
737
741
|
return self.write_dict(*args, elements_dict=elements_dict_normalized, **kwargs)
|
|
738
742
|
|
|
739
|
-
def write_elements(self, elements: list[Element], *args, **kwargs) -> None:
|
|
743
|
+
def write_elements(self, elements: list["Element"], *args, **kwargs) -> None:
|
|
740
744
|
elements_dict = [e.to_dict() for e in elements]
|
|
741
745
|
self.modify_and_write_dict(*args, elements_dict=elements_dict, **kwargs)
|
|
742
746
|
|
|
@@ -8,11 +8,9 @@ from dataclasses import dataclass, field
|
|
|
8
8
|
from multiprocessing.managers import DictProxy
|
|
9
9
|
from pathlib import Path
|
|
10
10
|
|
|
11
|
-
import backoff
|
|
12
11
|
from dataclasses_json import DataClassJsonMixin
|
|
13
12
|
|
|
14
13
|
from unstructured_ingest.error import SourceConnectionNetworkError
|
|
15
|
-
from unstructured_ingest.ingest_backoff import RetryHandler
|
|
16
14
|
from unstructured_ingest.interfaces import (
|
|
17
15
|
BaseDestinationConnector,
|
|
18
16
|
BaseSourceConnector,
|
|
@@ -23,6 +21,9 @@ from unstructured_ingest.interfaces import (
|
|
|
23
21
|
)
|
|
24
22
|
from unstructured_ingest.logger import ingest_log_streaming_init, logger
|
|
25
23
|
|
|
24
|
+
if t.TYPE_CHECKING:
|
|
25
|
+
from unstructured_ingest.ingest_backoff import RetryHandler
|
|
26
|
+
|
|
26
27
|
|
|
27
28
|
@dataclass
|
|
28
29
|
class PipelineContext(ProcessorConfig):
|
|
@@ -147,8 +148,12 @@ class SourceNode(PipelineNode):
|
|
|
147
148
|
retry_strategy_config: t.Optional[RetryStrategyConfig] = None
|
|
148
149
|
|
|
149
150
|
@property
|
|
150
|
-
def retry_strategy(self) -> t.Optional[RetryHandler]:
|
|
151
|
+
def retry_strategy(self) -> t.Optional["RetryHandler"]:
|
|
151
152
|
if retry_strategy_config := self.retry_strategy_config:
|
|
153
|
+
import backoff
|
|
154
|
+
|
|
155
|
+
from unstructured_ingest.ingest_backoff import RetryHandler
|
|
156
|
+
|
|
152
157
|
return RetryHandler(
|
|
153
158
|
backoff.expo,
|
|
154
159
|
SourceConnectionNetworkError,
|
|
@@ -5,16 +5,15 @@ import json
|
|
|
5
5
|
import os.path
|
|
6
6
|
from dataclasses import dataclass
|
|
7
7
|
from pathlib import Path
|
|
8
|
-
from typing import Optional
|
|
9
|
-
|
|
10
|
-
from unstructured.chunking import dispatch
|
|
11
|
-
from unstructured.documents.elements import Element, assign_and_map_hash_ids
|
|
12
|
-
from unstructured.partition.api import partition_via_api
|
|
13
|
-
from unstructured.staging.base import elements_from_json, elements_to_dicts
|
|
8
|
+
from typing import TYPE_CHECKING, Optional
|
|
14
9
|
|
|
15
10
|
from unstructured_ingest.interfaces import ChunkingConfig, PartitionConfig
|
|
16
11
|
from unstructured_ingest.logger import logger
|
|
17
12
|
from unstructured_ingest.pipeline.interfaces import ReformatNode
|
|
13
|
+
from unstructured_ingest.utils.chunking import assign_and_map_hash_ids
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from unstructured.documents.elements import Element
|
|
18
17
|
|
|
19
18
|
|
|
20
19
|
@dataclass
|
|
@@ -69,9 +68,9 @@ class Chunker(ReformatNode):
|
|
|
69
68
|
logger.info(f"chunking_strategy is None, skipping chunking for {filename_ext}")
|
|
70
69
|
return
|
|
71
70
|
|
|
72
|
-
|
|
71
|
+
element_dicts = [e.to_dict() for e in chunked_elements]
|
|
72
|
+
assign_and_map_hash_ids(elements=element_dicts)
|
|
73
73
|
|
|
74
|
-
element_dicts = elements_to_dicts(chunked_elements)
|
|
75
74
|
with open(json_path, "w", encoding="utf8") as output_f:
|
|
76
75
|
logger.info(f"writing chunking content to {json_path}")
|
|
77
76
|
json.dump(element_dicts, output_f, ensure_ascii=False, indent=2)
|
|
@@ -86,13 +85,16 @@ class Chunker(ReformatNode):
|
|
|
86
85
|
def get_path(self) -> Path:
|
|
87
86
|
return (Path(self.pipeline_context.work_dir) / "chunked").resolve()
|
|
88
87
|
|
|
89
|
-
def chunk(self, elements_json_file: str) -> Optional[list[Element]]:
|
|
88
|
+
def chunk(self, elements_json_file: str) -> Optional[list["Element"]]:
|
|
90
89
|
"""Called by Chunker.run() to properly execute the defined chunking_strategy."""
|
|
91
90
|
# -- No chunking_strategy means no chunking --
|
|
92
91
|
if self.chunking_config.chunking_strategy is None:
|
|
93
92
|
return
|
|
94
93
|
# -- Chunk locally for open-source chunking strategies, even when partitioning remotely --
|
|
95
94
|
if self.chunking_config.chunking_strategy in ("basic", "by_title"):
|
|
95
|
+
from unstructured.chunking import dispatch
|
|
96
|
+
from unstructured.staging.base import elements_from_json
|
|
97
|
+
|
|
96
98
|
return dispatch.chunk(
|
|
97
99
|
elements=elements_from_json(filename=elements_json_file),
|
|
98
100
|
chunking_strategy=self.chunking_config.chunking_strategy,
|
|
@@ -106,6 +108,8 @@ class Chunker(ReformatNode):
|
|
|
106
108
|
)
|
|
107
109
|
# -- Chunk remotely --
|
|
108
110
|
if self.partition_config.partition_by_api:
|
|
111
|
+
from unstructured.partition.api import partition_via_api
|
|
112
|
+
|
|
109
113
|
return partition_via_api(
|
|
110
114
|
filename=elements_json_file,
|
|
111
115
|
# -- (jennings) If api_key or api_url are None, partition_via_api will raise an
|
|
@@ -5,8 +5,6 @@ from dataclasses import dataclass
|
|
|
5
5
|
from pathlib import Path
|
|
6
6
|
from typing import Optional
|
|
7
7
|
|
|
8
|
-
from unstructured.staging.base import elements_from_json, elements_to_dicts
|
|
9
|
-
|
|
10
8
|
from unstructured_ingest.interfaces import (
|
|
11
9
|
EmbeddingConfig,
|
|
12
10
|
)
|
|
@@ -29,6 +27,8 @@ class Embedder(ReformatNode):
|
|
|
29
27
|
return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]
|
|
30
28
|
|
|
31
29
|
def run(self, elements_json: str) -> Optional[str]:
|
|
30
|
+
from unstructured.staging.base import elements_from_json
|
|
31
|
+
|
|
32
32
|
try:
|
|
33
33
|
elements_json_filename = os.path.basename(elements_json)
|
|
34
34
|
filename_ext = os.path.basename(elements_json_filename)
|
|
@@ -51,7 +51,7 @@ class Embedder(ReformatNode):
|
|
|
51
51
|
elements = elements_from_json(filename=elements_json)
|
|
52
52
|
embedder = self.embedder_config.get_embedder()
|
|
53
53
|
embedded_elements = embedder.embed_documents(elements=elements)
|
|
54
|
-
element_dicts = elements_to_dicts(embedded_elements)
|
|
54
|
+
element_dicts = [e.to_dict() for e in embedded_elements]
|
|
55
55
|
with open(json_path, "w", encoding="utf8") as output_f:
|
|
56
56
|
logger.info(f"writing embeddings content to {json_path}")
|
|
57
57
|
json.dump(element_dicts, output_f, ensure_ascii=False, indent=2)
|
|
@@ -2,7 +2,7 @@ import typing as t
|
|
|
2
2
|
from typing import Type
|
|
3
3
|
|
|
4
4
|
from .airtable import AirtableRunner
|
|
5
|
-
from .
|
|
5
|
+
from .astradb import AstraDBRunner
|
|
6
6
|
from .base_runner import Runner
|
|
7
7
|
from .biomed import BiomedRunner
|
|
8
8
|
from .confluence import ConfluenceRunner
|
|
@@ -36,7 +36,7 @@ from .wikipedia import WikipediaRunner
|
|
|
36
36
|
|
|
37
37
|
runner_map: t.Dict[str, Type[Runner]] = {
|
|
38
38
|
"airtable": AirtableRunner,
|
|
39
|
-
"
|
|
39
|
+
"astradb": AstraDBRunner,
|
|
40
40
|
"azure": AzureRunner,
|
|
41
41
|
"biomed": BiomedRunner,
|
|
42
42
|
"box": BoxRunner,
|
|
@@ -8,27 +8,27 @@ from unstructured_ingest.runner.base_runner import Runner
|
|
|
8
8
|
from unstructured_ingest.runner.utils import update_download_dir_hash
|
|
9
9
|
|
|
10
10
|
if t.TYPE_CHECKING:
|
|
11
|
-
from unstructured_ingest.connector.
|
|
11
|
+
from unstructured_ingest.connector.astradb import SimpleAstraDBConfig
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
@dataclass
|
|
15
|
-
class
|
|
16
|
-
connector_config: "
|
|
15
|
+
class AstraDBRunner(Runner):
|
|
16
|
+
connector_config: "SimpleAstraDBConfig"
|
|
17
17
|
|
|
18
18
|
def update_read_config(self):
|
|
19
19
|
hashed_dir_name = hashlib.sha256(
|
|
20
20
|
str(self.connector_config.access_config.api_endpoint).encode("utf-8"),
|
|
21
21
|
)
|
|
22
22
|
self.read_config.download_dir = update_download_dir_hash(
|
|
23
|
-
connector_name="astra",
|
|
23
|
+
connector_name="astradb",
|
|
24
24
|
read_config=self.read_config,
|
|
25
25
|
hashed_dir_name=hashed_dir_name,
|
|
26
26
|
logger=logger,
|
|
27
27
|
)
|
|
28
28
|
|
|
29
29
|
def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
|
|
30
|
-
from unstructured_ingest.connector.
|
|
31
|
-
|
|
30
|
+
from unstructured_ingest.connector.astradb import (
|
|
31
|
+
AstraDBSourceConnector,
|
|
32
32
|
)
|
|
33
33
|
|
|
34
|
-
return
|
|
34
|
+
return AstraDBSourceConnector
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import typing as t
|
|
2
2
|
|
|
3
|
-
from .
|
|
3
|
+
from .astradb import AstraDBWriter
|
|
4
4
|
from .azure_cognitive_search import AzureCognitiveSearchWriter
|
|
5
5
|
from .base_writer import Writer
|
|
6
6
|
from .chroma import ChromaWriter
|
|
@@ -23,7 +23,7 @@ from .vectara import VectaraWriter
|
|
|
23
23
|
from .weaviate import WeaviateWriter
|
|
24
24
|
|
|
25
25
|
writer_map: t.Dict[str, t.Type[Writer]] = {
|
|
26
|
-
"
|
|
26
|
+
"astradb": AstraDBWriter,
|
|
27
27
|
"azure": AzureWriter,
|
|
28
28
|
"azure_cognitive_search": AzureCognitiveSearchWriter,
|
|
29
29
|
"box": BoxWriter,
|
|
@@ -6,17 +6,17 @@ from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
|
6
6
|
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
7
7
|
|
|
8
8
|
if t.TYPE_CHECKING:
|
|
9
|
-
from unstructured_ingest.connector.
|
|
9
|
+
from unstructured_ingest.connector.astradb import AstraDBWriteConfig, SimpleAstraDBConfig
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
@dataclass
|
|
13
|
-
class
|
|
14
|
-
write_config: "
|
|
15
|
-
connector_config: "
|
|
13
|
+
class AstraDBWriter(Writer, EnhancedDataClassJsonMixin):
|
|
14
|
+
write_config: "AstraDBWriteConfig"
|
|
15
|
+
connector_config: "SimpleAstraDBConfig"
|
|
16
16
|
|
|
17
17
|
def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
|
|
18
|
-
from unstructured_ingest.connector.
|
|
19
|
-
|
|
18
|
+
from unstructured_ingest.connector.astradb import (
|
|
19
|
+
AstraDBDestinationConnector,
|
|
20
20
|
)
|
|
21
21
|
|
|
22
|
-
return
|
|
22
|
+
return AstraDBDestinationConnector
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
from itertools import groupby
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def id_to_hash(element: dict, sequence_number: int) -> str:
|
|
6
|
+
"""Calculates and assigns a deterministic hash as an ID.
|
|
7
|
+
|
|
8
|
+
The hash ID is based on element's text, sequence number on page,
|
|
9
|
+
page number and its filename.
|
|
10
|
+
|
|
11
|
+
Args:
|
|
12
|
+
sequence_number: index on page
|
|
13
|
+
|
|
14
|
+
Returns: new ID value
|
|
15
|
+
"""
|
|
16
|
+
filename = element["metadata"].get("filename")
|
|
17
|
+
text = element["text"]
|
|
18
|
+
page_number = element["metadata"].get("page_number")
|
|
19
|
+
data = f"{filename}{text}{page_number}{sequence_number}"
|
|
20
|
+
element["element_id"] = hashlib.sha256(data.encode()).hexdigest()[:32]
|
|
21
|
+
return element["element_id"]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def assign_and_map_hash_ids(elements: list[dict]) -> list[dict]:
|
|
25
|
+
# -- generate sequence number for each element on a page --
|
|
26
|
+
elements = elements.copy()
|
|
27
|
+
page_numbers = [e["metadata"].get("page_number") for e in elements]
|
|
28
|
+
page_seq_pairs = [
|
|
29
|
+
seq_on_page for page, group in groupby(page_numbers) for seq_on_page, _ in enumerate(group)
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
# -- assign hash IDs to elements --
|
|
33
|
+
old_to_new_mapping = {
|
|
34
|
+
element["element_id"]: id_to_hash(element=element, sequence_number=seq_on_page_counter)
|
|
35
|
+
for element, seq_on_page_counter in zip(elements, page_seq_pairs)
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
# -- map old parent IDs to new ones --
|
|
39
|
+
for e in elements:
|
|
40
|
+
parent_id = e["metadata"].get("parent_id")
|
|
41
|
+
if not parent_id:
|
|
42
|
+
continue
|
|
43
|
+
e["metadata"]["parent_id"] = old_to_new_mapping[parent_id]
|
|
44
|
+
|
|
45
|
+
return elements
|
|
@@ -33,7 +33,7 @@ def requires_dependencies(
|
|
|
33
33
|
raise ImportError(
|
|
34
34
|
f"Following dependencies are missing: {', '.join(missing_deps)}. "
|
|
35
35
|
+ (
|
|
36
|
-
f"""Please install them using `pip install "unstructured[{extras}]"`."""
|
|
36
|
+
f"""Please install them using `pip install "unstructured-ingest[{extras}]"`.""" # noqa: E501
|
|
37
37
|
if extras
|
|
38
38
|
else f"Please install them using `pip install {' '.join(missing_deps)}`."
|
|
39
39
|
),
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
GOOGLE_DRIVE_EXPORT_TYPES = {
|
|
2
|
+
"application/vnd.google-apps.document": "application/"
|
|
3
|
+
"vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
4
|
+
"application/vnd.google-apps.spreadsheet": "application/"
|
|
5
|
+
"vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
6
|
+
"application/vnd.google-apps.presentation": "application/"
|
|
7
|
+
"vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
8
|
+
"application/vnd.google-apps.photo": "image/jpeg",
|
|
9
|
+
}
|
|
@@ -1,13 +1,15 @@
|
|
|
1
1
|
import inspect
|
|
2
2
|
from abc import ABC, abstractmethod
|
|
3
|
+
from collections import Counter
|
|
3
4
|
from dataclasses import dataclass, field, fields
|
|
4
5
|
from typing import Any, Optional, Type, TypeVar
|
|
5
6
|
|
|
6
7
|
import click
|
|
8
|
+
from pydantic import BaseModel
|
|
7
9
|
|
|
8
10
|
from unstructured_ingest.v2.cli.base.importer import import_from_string
|
|
9
|
-
from unstructured_ingest.v2.cli.
|
|
10
|
-
from unstructured_ingest.v2.cli.utils import
|
|
11
|
+
from unstructured_ingest.v2.cli.utils.click import extract_config
|
|
12
|
+
from unstructured_ingest.v2.cli.utils.model_conversion import options_from_base_model, post_check
|
|
11
13
|
from unstructured_ingest.v2.interfaces import ProcessorConfig
|
|
12
14
|
from unstructured_ingest.v2.logger import logger
|
|
13
15
|
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
|
|
@@ -15,6 +17,7 @@ from unstructured_ingest.v2.processes.chunker import Chunker, ChunkerConfig
|
|
|
15
17
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
16
18
|
DownloaderT,
|
|
17
19
|
IndexerT,
|
|
20
|
+
RegistryEntry,
|
|
18
21
|
UploaderT,
|
|
19
22
|
UploadStager,
|
|
20
23
|
UploadStagerConfig,
|
|
@@ -24,6 +27,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
|
|
|
24
27
|
)
|
|
25
28
|
from unstructured_ingest.v2.processes.connectors.local import LocalUploader, LocalUploaderConfig
|
|
26
29
|
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
|
|
30
|
+
from unstructured_ingest.v2.processes.filter import Filterer, FiltererConfig
|
|
27
31
|
from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig
|
|
28
32
|
|
|
29
33
|
CommandT = TypeVar("CommandT", bound=click.Command)
|
|
@@ -32,7 +36,52 @@ CommandT = TypeVar("CommandT", bound=click.Command)
|
|
|
32
36
|
@dataclass
|
|
33
37
|
class BaseCmd(ABC):
|
|
34
38
|
cmd_name: str
|
|
35
|
-
|
|
39
|
+
registry_entry: RegistryEntry
|
|
40
|
+
default_configs: list[Type[BaseModel]] = field(default_factory=list)
|
|
41
|
+
|
|
42
|
+
@abstractmethod
|
|
43
|
+
def get_registry_options(self):
|
|
44
|
+
pass
|
|
45
|
+
|
|
46
|
+
def get_default_options(self) -> list[click.Option]:
|
|
47
|
+
options = []
|
|
48
|
+
for extra in self.default_configs:
|
|
49
|
+
options.extend(options_from_base_model(model=extra))
|
|
50
|
+
return options
|
|
51
|
+
|
|
52
|
+
@classmethod
|
|
53
|
+
def consolidate_options(cls, options: list[click.Option]) -> list[click.Option]:
|
|
54
|
+
option_names = [option.name for option in options]
|
|
55
|
+
duplicate_names = [name for name, count in Counter(option_names).items() if count > 1]
|
|
56
|
+
if not duplicate_names:
|
|
57
|
+
return options
|
|
58
|
+
consolidated_options = []
|
|
59
|
+
current_names = []
|
|
60
|
+
for option in options:
|
|
61
|
+
if option.name not in current_names:
|
|
62
|
+
current_names.append(option.name)
|
|
63
|
+
consolidated_options.append(option)
|
|
64
|
+
continue
|
|
65
|
+
existing_option = next(o for o in consolidated_options if o.name == option.name)
|
|
66
|
+
if existing_option.__dict__ == option.__dict__:
|
|
67
|
+
continue
|
|
68
|
+
option_diff = cls.get_options_diff(o1=option, o2=existing_option)
|
|
69
|
+
raise ValueError(
|
|
70
|
+
"Conflicting duplicate {} option defined: {}".format(
|
|
71
|
+
option.name, " | ".join([f"{d[0]}: {d[1]}" for d in option_diff])
|
|
72
|
+
)
|
|
73
|
+
)
|
|
74
|
+
return consolidated_options
|
|
75
|
+
|
|
76
|
+
@staticmethod
|
|
77
|
+
def get_options_diff(o1: click.Option, o2: click.Option):
|
|
78
|
+
o1_dict = o1.__dict__
|
|
79
|
+
o2_dict = o2.__dict__
|
|
80
|
+
for d in [o1_dict, o2_dict]:
|
|
81
|
+
d["opts"] = ",".join(d["opts"])
|
|
82
|
+
d["secondary_opts"] = ",".join(d["secondary_opts"])
|
|
83
|
+
option_diff = set(o1_dict.items()) ^ set(o2_dict.items())
|
|
84
|
+
return option_diff
|
|
36
85
|
|
|
37
86
|
@property
|
|
38
87
|
def cmd_name_key(self):
|
|
@@ -46,15 +95,11 @@ class BaseCmd(ABC):
|
|
|
46
95
|
def cmd(self, ctx: click.Context, **options) -> None:
|
|
47
96
|
pass
|
|
48
97
|
|
|
49
|
-
def add_options(self, cmd: CommandT
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
try:
|
|
55
|
-
config.add_cli_options(cmd=cmd)
|
|
56
|
-
except ValueError as e:
|
|
57
|
-
raise ValueError(f"failed to set configs from {config.__name__}: {e}")
|
|
98
|
+
def add_options(self, cmd: CommandT) -> CommandT:
|
|
99
|
+
options = self.get_registry_options()
|
|
100
|
+
options.extend(self.get_default_options())
|
|
101
|
+
post_check(options)
|
|
102
|
+
cmd.params.extend(options)
|
|
58
103
|
return cmd
|
|
59
104
|
|
|
60
105
|
def get_pipline(
|
|
@@ -75,6 +120,8 @@ class BaseCmd(ABC):
|
|
|
75
120
|
}
|
|
76
121
|
if chunker := self.get_chunker(options=source_options):
|
|
77
122
|
pipeline_kwargs["chunker"] = chunker
|
|
123
|
+
if filterer := self.get_filterer(options=source_options):
|
|
124
|
+
pipeline_kwargs["filterer"] = filterer
|
|
78
125
|
if embedder := self.get_embeder(options=source_options):
|
|
79
126
|
pipeline_kwargs["embedder"] = embedder
|
|
80
127
|
if dest:
|
|
@@ -105,6 +152,13 @@ class BaseCmd(ABC):
|
|
|
105
152
|
return None
|
|
106
153
|
return Chunker(config=chunker_config)
|
|
107
154
|
|
|
155
|
+
@staticmethod
|
|
156
|
+
def get_filterer(options: dict[str, Any]) -> Optional[Filterer]:
|
|
157
|
+
filterer_configs = extract_config(flat_data=options, config=FiltererConfig)
|
|
158
|
+
if not filterer_configs.dict():
|
|
159
|
+
return None
|
|
160
|
+
return Filterer(config=filterer_configs)
|
|
161
|
+
|
|
108
162
|
@staticmethod
|
|
109
163
|
def get_embeder(options: dict[str, Any]) -> Optional[Embedder]:
|
|
110
164
|
embedder_config = extract_config(flat_data=options, config=EmbedderConfig)
|
|
@@ -1,20 +1,34 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from dataclasses import dataclass
|
|
3
|
-
from typing import Optional, Type
|
|
4
3
|
|
|
5
4
|
import click
|
|
6
5
|
|
|
7
6
|
from unstructured_ingest.v2.cli.base.cmd import BaseCmd
|
|
8
|
-
from unstructured_ingest.v2.cli.
|
|
9
|
-
from unstructured_ingest.v2.cli.utils import
|
|
7
|
+
from unstructured_ingest.v2.cli.utils.click import Dict, conform_click_options
|
|
8
|
+
from unstructured_ingest.v2.cli.utils.model_conversion import options_from_base_model
|
|
10
9
|
from unstructured_ingest.v2.logger import logger
|
|
10
|
+
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
@dataclass
|
|
14
14
|
class DestCmd(BaseCmd):
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
15
|
+
registry_entry: DestinationRegistryEntry
|
|
16
|
+
|
|
17
|
+
def get_registry_options(self):
|
|
18
|
+
options = []
|
|
19
|
+
configs = [
|
|
20
|
+
config
|
|
21
|
+
for config in [
|
|
22
|
+
self.registry_entry.uploader_config,
|
|
23
|
+
self.registry_entry.upload_stager_config,
|
|
24
|
+
self.registry_entry.connection_config,
|
|
25
|
+
]
|
|
26
|
+
if config
|
|
27
|
+
]
|
|
28
|
+
for config in configs:
|
|
29
|
+
options.extend(options_from_base_model(model=config))
|
|
30
|
+
options = self.consolidate_options(options=options)
|
|
31
|
+
return options
|
|
18
32
|
|
|
19
33
|
def cmd(self, ctx: click.Context, **options) -> None:
|
|
20
34
|
logger.setLevel(logging.DEBUG if options.get("verbose", False) else logging.INFO)
|
|
@@ -47,12 +61,7 @@ class DestCmd(BaseCmd):
|
|
|
47
61
|
cmd.name = self.cli_cmd_name
|
|
48
62
|
cmd.short_help = "v2"
|
|
49
63
|
cmd.invoke_without_command = True
|
|
50
|
-
|
|
51
|
-
x
|
|
52
|
-
for x in [self.uploader_config, self.upload_stager_config, self.connection_config]
|
|
53
|
-
if x
|
|
54
|
-
]
|
|
55
|
-
self.add_options(cmd, extras=extras)
|
|
64
|
+
self.add_options(cmd)
|
|
56
65
|
cmd.params.append(
|
|
57
66
|
click.Option(
|
|
58
67
|
["--custom-stager"],
|
|
@@ -1,35 +1,52 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from dataclasses import dataclass, field
|
|
3
|
-
from typing import Optional, Type
|
|
4
3
|
|
|
5
4
|
import click
|
|
5
|
+
from pydantic import BaseModel
|
|
6
6
|
|
|
7
7
|
from unstructured_ingest.v2.cli.base.cmd import BaseCmd
|
|
8
|
-
from unstructured_ingest.v2.cli.
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
PartitionerCliConfig,
|
|
12
|
-
ProcessorCliConfig,
|
|
13
|
-
)
|
|
14
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
15
|
-
from unstructured_ingest.v2.cli.utils import Group, conform_click_options
|
|
8
|
+
from unstructured_ingest.v2.cli.utils.click import Group, conform_click_options
|
|
9
|
+
from unstructured_ingest.v2.cli.utils.model_conversion import options_from_base_model
|
|
10
|
+
from unstructured_ingest.v2.interfaces import ProcessorConfig
|
|
16
11
|
from unstructured_ingest.v2.logger import logger
|
|
12
|
+
from unstructured_ingest.v2.processes import (
|
|
13
|
+
ChunkerConfig,
|
|
14
|
+
EmbedderConfig,
|
|
15
|
+
FiltererConfig,
|
|
16
|
+
PartitionerConfig,
|
|
17
|
+
)
|
|
18
|
+
from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
|
|
17
19
|
|
|
18
20
|
|
|
19
21
|
@dataclass
|
|
20
22
|
class SrcCmd(BaseCmd):
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
connection_config: Optional[Type[CliConfig]] = None
|
|
24
|
-
default_configs: list[CliConfig] = field(
|
|
23
|
+
registry_entry: SourceRegistryEntry
|
|
24
|
+
default_configs: list[BaseModel] = field(
|
|
25
25
|
default_factory=lambda: [
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
26
|
+
ProcessorConfig,
|
|
27
|
+
PartitionerConfig,
|
|
28
|
+
EmbedderConfig,
|
|
29
|
+
FiltererConfig,
|
|
30
|
+
ChunkerConfig,
|
|
30
31
|
]
|
|
31
32
|
)
|
|
32
33
|
|
|
34
|
+
def get_registry_options(self):
|
|
35
|
+
options = []
|
|
36
|
+
configs = [
|
|
37
|
+
config
|
|
38
|
+
for config in [
|
|
39
|
+
self.registry_entry.connection_config,
|
|
40
|
+
self.registry_entry.indexer_config,
|
|
41
|
+
self.registry_entry.downloader_config,
|
|
42
|
+
]
|
|
43
|
+
if config
|
|
44
|
+
]
|
|
45
|
+
for config in configs:
|
|
46
|
+
options.extend(options_from_base_model(model=config))
|
|
47
|
+
options = self.consolidate_options(options=options)
|
|
48
|
+
return options
|
|
49
|
+
|
|
33
50
|
def cmd(self, ctx: click.Context, **options) -> None:
|
|
34
51
|
if ctx.invoked_subcommand:
|
|
35
52
|
return
|
|
@@ -53,10 +70,7 @@ class SrcCmd(BaseCmd):
|
|
|
53
70
|
cmd.name = self.cli_cmd_name
|
|
54
71
|
cmd.short_help = "v2"
|
|
55
72
|
cmd.invoke_without_command = True
|
|
56
|
-
|
|
57
|
-
x for x in [self.indexer_config, self.downloader_config, self.connection_config] if x
|
|
58
|
-
]
|
|
59
|
-
self.add_options(cmd, extras=extras)
|
|
73
|
+
self.add_options(cmd)
|
|
60
74
|
|
|
61
75
|
# TODO remove after v1 no longer supported
|
|
62
76
|
cmd.params.append(
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import click
|
|
2
|
+
|
|
3
|
+
from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
|
|
4
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
5
|
+
destination_registry,
|
|
6
|
+
source_registry,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
src_cmds = [SrcCmd(cmd_name=k, registry_entry=v) for k, v in source_registry.items()]
|
|
10
|
+
dest_cmds = [DestCmd(cmd_name=k, registry_entry=v) for k, v in destination_registry.items()]
|
|
11
|
+
|
|
12
|
+
src: list[click.Group] = [v.get_cmd() for v in src_cmds]
|
|
13
|
+
|
|
14
|
+
dest: list[click.Command] = [v.get_cmd() for v in dest_cmds]
|