unstructured-ingest 0.0.14__py3-none-any.whl → 0.0.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +1 -1
- unstructured_ingest/cli/utils.py +1 -1
- unstructured_ingest/connector/astradb.py +1 -1
- unstructured_ingest/connector/biomed.py +4 -4
- unstructured_ingest/connector/chroma.py +1 -1
- unstructured_ingest/connector/databricks_volumes.py +2 -2
- unstructured_ingest/connector/fsspec/box.py +1 -1
- unstructured_ingest/connector/fsspec/fsspec.py +5 -5
- unstructured_ingest/connector/git.py +1 -1
- unstructured_ingest/connector/google_drive.py +4 -4
- unstructured_ingest/connector/hubspot.py +1 -1
- unstructured_ingest/connector/kafka.py +8 -8
- unstructured_ingest/connector/local.py +1 -1
- unstructured_ingest/connector/notion/helpers.py +4 -4
- unstructured_ingest/connector/onedrive.py +3 -3
- unstructured_ingest/connector/outlook.py +2 -2
- unstructured_ingest/connector/pinecone.py +1 -1
- unstructured_ingest/connector/sharepoint.py +8 -8
- unstructured_ingest/connector/vectara.py +6 -6
- unstructured_ingest/interfaces.py +4 -4
- unstructured_ingest/logger.py +1 -1
- unstructured_ingest/pipeline/copy.py +1 -1
- unstructured_ingest/pipeline/interfaces.py +2 -2
- unstructured_ingest/pipeline/partition.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/pipeline/reformat/chunking.py +2 -2
- unstructured_ingest/pipeline/reformat/embedding.py +1 -1
- unstructured_ingest/pipeline/source.py +2 -2
- unstructured_ingest/utils/compression.py +3 -3
- unstructured_ingest/utils/string_and_date_utils.py +2 -2
- unstructured_ingest/v2/cli/base/cmd.py +3 -3
- unstructured_ingest/v2/cli/base/dest.py +1 -1
- unstructured_ingest/v2/cli/base/src.py +1 -1
- unstructured_ingest/v2/cli/utils/click.py +1 -1
- unstructured_ingest/v2/interfaces/processor.py +48 -13
- unstructured_ingest/v2/logger.py +1 -1
- unstructured_ingest/v2/otel.py +1 -1
- unstructured_ingest/v2/pipeline/interfaces.py +9 -2
- unstructured_ingest/v2/pipeline/pipeline.py +17 -6
- unstructured_ingest/v2/pipeline/steps/chunk.py +3 -3
- unstructured_ingest/v2/pipeline/steps/download.py +17 -2
- unstructured_ingest/v2/pipeline/steps/embed.py +3 -3
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
- unstructured_ingest/v2/pipeline/steps/index.py +2 -2
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -3
- unstructured_ingest/v2/pipeline/steps/stage.py +1 -1
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -4
- unstructured_ingest/v2/processes/connectors/google_drive.py +1 -1
- unstructured_ingest/v2/processes/connectors/local.py +6 -5
- unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
- unstructured_ingest/v2/processes/connectors/onedrive.py +2 -2
- unstructured_ingest/v2/processes/connectors/opensearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +2 -2
- unstructured_ingest/v2/processes/connectors/sharepoint.py +9 -5
- unstructured_ingest/v2/processes/filter.py +1 -1
- unstructured_ingest/v2/processes/partitioner.py +3 -3
- unstructured_ingest/v2/utils.py +7 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/METADATA +272 -274
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/RECORD +69 -69
- unstructured_ingest/evaluate.py +0 -338
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/top_level.txt +0 -0
unstructured_ingest/utils/string_and_date_utils.py CHANGED
@@ -10,13 +10,13 @@ def json_to_dict(json_string: str) -> t.Union[str, t.Dict[str, t.Any]]:
     try:
         return json.loads(json_string)
     except json.JSONDecodeError:
-        # Not
+        # Not necessary an error if it is a path or malformed json
         pass
     try:
         # This is common when single quotes are used instead of double quotes
         return json.loads(json_string.replace("'", '"'))
     except json.JSONDecodeError:
-        # Not
+        # Not necessary an error if it is a path
        pass
     return json_string

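The change above only lengthens the two inline comments; the fallback behavior of json_to_dict is unchanged. For reference, a minimal usage sketch of that behavior (the import path is inferred from the file list above and should be treated as an assumption):

    from unstructured_ingest.utils.string_and_date_utils import json_to_dict  # assumed import path

    # Well-formed JSON parses directly.
    assert json_to_dict('{"a": 1}') == {"a": 1}

    # Single-quoted pseudo-JSON is retried with the quotes swapped to double quotes.
    assert json_to_dict("{'a': 1}") == {"a": 1}

    # Anything else (a filesystem path, malformed JSON) is returned unchanged as a string.
    assert json_to_dict("/tmp/config.json") == "/tmp/config.json"
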
unstructured_ingest/v2/cli/base/cmd.py CHANGED
@@ -102,7 +102,7 @@ class BaseCmd(ABC):
         cmd.params.extend(options)
         return cmd

-    def
+    def get_pipeline(
         self,
         src: str,
         source_options: dict[str, Any],
@@ -122,7 +122,7 @@ class BaseCmd(ABC):
             pipeline_kwargs["chunker"] = chunker
         if filterer := self.get_filterer(options=source_options):
             pipeline_kwargs["filterer"] = filterer
-        if embedder := self.
+        if embedder := self.get_embedder(options=source_options):
             pipeline_kwargs["embedder"] = embedder
         if dest:
             logger.debug(
@@ -160,7 +160,7 @@ class BaseCmd(ABC):
         return Filterer(config=filterer_configs)

     @staticmethod
-    def
+    def get_embedder(options: dict[str, Any]) -> Optional[Embedder]:
         embedder_config = extract_config(flat_data=options, config=EmbedderConfig)
         if not embedder_config.embedding_provider:
             return None

unstructured_ingest/v2/cli/base/dest.py CHANGED
@@ -40,7 +40,7 @@ class DestCmd(BaseCmd):
         source_options: dict = ctx.parent.params if ctx.parent else {}
         conform_click_options(options)
         try:
-            pipeline = self.
+            pipeline = self.get_pipeline(
                 src=source_cmd,
                 source_options=source_options,
                 dest=self.cmd_name,

unstructured_ingest/v2/cli/base/src.py CHANGED
@@ -55,7 +55,7 @@ class SrcCmd(BaseCmd):
         conform_click_options(options)
         logger.setLevel(logging.DEBUG if options.get("verbose", False) else logging.INFO)
         try:
-            pipeline = self.
+            pipeline = self.get_pipeline(src=self.cmd_name, source_options=options)
             pipeline.run()
         except Exception as e:
             logger.error(f"failed to run source command {self.cmd_name}: {e}", exc_info=True)

unstructured_ingest/v2/interfaces/processor.py CHANGED
@@ -11,21 +11,56 @@ DEFAULT_WORK_DIR = str((Path.home() / ".cache" / "unstructured" / "ingest" / "pi
 class ProcessorConfig(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)

-    reprocess: bool =
-
-
-
-
-
-
+    reprocess: bool = Field(
+        default=False,
+        description="Reprocess a downloaded file even if the relevant structured "
+        "output .json file in output directory already exists.",
+    )
+    verbose: bool = Field(default=False)
+    tqdm: bool = Field(default=False, description="Display tqdm progress bar")
+    work_dir: str = Field(
+        default_factory=lambda: DEFAULT_WORK_DIR,
+        description="Where to place working files when processing each step",
+    )
+    num_processes: int = Field(
+        default=2, description="Number of parallel processes with which to process docs"
+    )
+    max_connections: Optional[int] = Field(
+        default=None, description="Limit of concurrent connectionts"
+    )
+    raise_on_error: bool = Field(
+        default=False,
+        description="Is set, will raise error if any doc in the pipeline fail. "
+        "Otherwise will log error and continue with other docs",
+    )
     disable_parallelism: bool = Field(
-        default_factory=lambda: os.getenv("INGEST_DISABLE_PARALLELISM", "false").lower() == "true"
+        default_factory=lambda: os.getenv("INGEST_DISABLE_PARALLELISM", "false").lower() == "true",
+    )
+    preserve_downloads: bool = Field(
+        default=False, description="Don't delete downloaded files after process completes"
+    )
+    download_only: bool = Field(
+        default=False, description="skip the rest of the process after files are downloaded"
+    )
+    re_download: bool = Field(
+        default=False,
+        description="If set, will re-download downloaded files "
+        "regardless of if they already exist locally",
+    )
+    uncompress: bool = Field(
+        default=False,
+        description="Uncompress any archived files. Currently supporting "
+        "zip and tar files based on file extension.",
+    )
+    iter_delete: bool = Field(
+        default=False,
+        description="If limited on memory, this can be enabled to delete "
+        "cached content as it's used and no longer needed in the pipeline.",
+    )
+    delete_cache: bool = Field(
+        default=False,
+        description="If set, will delete the cache work directory when process finishes",
     )
-    preserve_downloads: bool = False
-    download_only: bool = False
-    max_docs: Optional[int] = None
-    re_download: bool = False
-    uncompress: bool = False

     # OTEL support
     otel_endpoint: Optional[str] = Field(
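The rewrite above converts ProcessorConfig's bare attributes into documented pydantic Field definitions and adds the cache-management flags (iter_delete, delete_cache) that the pipeline changes further down rely on. A minimal sketch of how the expanded config might be instantiated; the field names come from the diff above, while the chosen values are only an example:

    from unstructured_ingest.v2.interfaces import ProcessorConfig  # import as used in pipeline.py below

    # Example: favor low memory usage over keeping intermediate artifacts.
    config = ProcessorConfig(
        num_processes=2,           # parallel worker processes
        iter_delete=True,          # drop each step's cached output once the next step has consumed it
        delete_cache=True,         # remove the whole work_dir when the run finishes
        preserve_downloads=False,  # allow downloaded files to be cleaned up as well
    )
    print(config.work_dir)         # defaults to DEFAULT_WORK_DIR under ~/.cache/unstructured/ingest
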
unstructured_ingest/v2/logger.py CHANGED
@@ -101,7 +101,7 @@ class SensitiveFormatter(Formatter):


 def remove_root_handlers(logger: Logger) -> None:
-    # NOTE(robinson)
+    # NOTE(robinson): in some environments such as Google Colab, there is a root handler
     # that doesn't not mask secrets, meaning sensitive info such as api keys appear in logs.
     # Removing these when they exist prevents this behavior
     if logger.root.hasHandlers():

unstructured_ingest/v2/otel.py CHANGED
@@ -92,7 +92,7 @@ class OtelHandler:
             return None
         from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter

-        logger.debug(f"
+        logger.debug(f"adding otel exported at {otel_endpoint}")
         trace_exporter = OTLPSpanExporter()
         processor = SimpleSpanProcessor(trace_exporter)
         provider.add_span_processor(processor)

unstructured_ingest/v2/pipeline/interfaces.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
 import asyncio
 import logging
 import multiprocessing as mp
+import shutil
 from abc import ABC, abstractmethod
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
@@ -118,10 +119,10 @@ class PipelineStep(ABC):
         iterable = iterable or []
         if iterable:
             logger.info(
-                f"
+                f"calling {self.__class__.__name__} " f"with {len(iterable)} docs", # type: ignore
             )
         else:
-            logger.info(f"
+            logger.info(f"calling {self.__class__.__name__} with no inputs")
         if self.context.async_supported and self.process.is_async():
             return self.process_async(iterable=iterable)
         if self.context.mp_supported:
@@ -180,6 +181,12 @@ class PipelineStep(ABC):
     def cache_dir(self) -> Path:
         return Path(self.context.work_dir) / self.identifier

+    def delete_cache(self):
+        if self.context.iter_delete and self.cache_dir.exists():
+            cache_dir = self.cache_dir
+            logger.info(f"deleting {self.identifier} cache dir {cache_dir}")
+            shutil.rmtree(cache_dir)
+

 @dataclass
 class BatchPipelineStep(PipelineStep, ABC):

unstructured_ingest/v2/pipeline/pipeline.py CHANGED
@@ -2,7 +2,9 @@ from __future__ import annotations

 import logging
 import multiprocessing as mp
+import shutil
 from dataclasses import InitVar, dataclass, field
+from pathlib import Path
 from typing import Any

 from unstructured_ingest.v2.interfaces import ProcessorConfig, Uploader
@@ -115,7 +117,9 @@ class Pipeline:
         )

     def cleanup(self):
-
+        if self.context.delete_cache and Path(self.context.work_dir).exists():
+            logger.info(f"deleting cache directory: {self.context.work_dir}")
+            shutil.rmtree(self.context.work_dir)

     def log_statuses(self):
         if status := self.context.status:
@@ -183,7 +187,7 @@
         return filtered_records

     def _run(self):
-        logger.info(f"
+        logger.info(f"running local pipeline: {self} with configs: " f"{self.context.json()}")
         if self.context.mp_supported:
             manager = mp.Manager()
             self.context.status = manager.dict()
@@ -228,26 +232,33 @@
             logger.info("No files to process after filtering uncompressed content, exiting")
             return

-        if not downloaded_data:
+        if not downloaded_data or self.context.download_only:
             return

         # Partition content
         elements = self.partitioner_step(downloaded_data)
+        # Download data non longer needed, delete if possible
+        self.downloader_step.delete_cache()
         elements = self.clean_results(results=elements)
         if not elements:
             logger.info("No files to process after partitioning, exiting")
             return

         # Run element specific modifiers
-
-
+        last_step = self.partitioner_step
+        for step in [s for s in [self.chunker_step, self.embedder_step, self.stager_step] if s]:
+            elements = step(elements)
             elements = self.clean_results(results=elements)
+            # Delete data from previous step if possible since no longer needed
+            last_step.delete_cache()
+            last_step = step
             if not elements:
-                logger.info(f"
+                logger.info(f"no files to process after {step.__class__.__name__}, exiting")
                 return

         # Upload the final result
         self.uploader_step(iterable=elements)
+        last_step.delete_cache()

     def __str__(self):
         s = [str(self.indexer_step)]

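The _run changes wire the new flags into the data flow: download_only short-circuits after the download step, and each intermediate cache is deleted once the following step has consumed it. A condensed sketch of that deletion pattern (simplified; not the exact Pipeline._run body):

    # Sketch of the iter_delete pattern above; step objects stand in for the
    # chunker/embedder/stager steps, each exposing __call__ and delete_cache().
    def run_modifier_steps(elements, partitioner_step, optional_steps, uploader_step):
        last_step = partitioner_step
        for step in [s for s in optional_steps if s]:  # skip steps that were not configured
            elements = step(elements)
            last_step.delete_cache()  # previous step's cached output is no longer needed
            last_step = step
            if not elements:
                return
        uploader_step(iterable=elements)
        last_step.delete_cache()  # final intermediate cache once the upload has run
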
unstructured_ingest/v2/pipeline/steps/chunk.py CHANGED
@@ -29,7 +29,7 @@ class ChunkStep(PipelineStep):

     def __post_init__(self):
         config = self.process.config.json() if self.process.config else None
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")

     def should_chunk(self, filepath: Path, file_data: FileData) -> bool:
         if self.context.reprocess or file_data.reprocess:
@@ -44,7 +44,7 @@ class ChunkStep(PipelineStep):

     def _save_output(self, output_filepath: str, chunked_content: list[dict]):
         with open(str(output_filepath), "w") as f:
-            logger.debug(f"
+            logger.debug(f"writing chunker output to: {output_filepath}")
             json.dump(chunked_content, f, indent=2)

     async def _run_async(
@@ -54,7 +54,7 @@ class ChunkStep(PipelineStep):
         file_data = FileData.from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=path)
         if not self.should_chunk(filepath=output_filepath, file_data=file_data):
-            logger.debug(f"
+            logger.debug(f"skipping chunking, output already exists: {output_filepath}")
             return ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath))
         fn_kwargs = {"elements_filepath": path}
         if not asyncio.iscoroutinefunction(fn):

unstructured_ingest/v2/pipeline/steps/download.py CHANGED
@@ -1,6 +1,7 @@
 import asyncio
 import hashlib
 import json
+import shutil
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict, TypeVar
@@ -82,7 +83,7 @@ class DownloadStep(PipelineStep):
                 f"match size of local file: {file_size_bytes}, updating"
             )
             file_data.metadata.filesize_bytes = file_size_bytes
-        logger.debug(f"
+        logger.debug(f"updating file data with new content: {file_data.to_dict()}")
         with file_data_path.open("w") as file:
             json.dump(file_data.to_dict(), file, indent=2)

@@ -90,7 +91,7 @@
         file_data = FileData.from_file(path=file_data_path)
         download_path = self.process.get_download_path(file_data=file_data)
         if not self.should_download(file_data=file_data, file_data_path=file_data_path):
-            logger.debug(f"
+            logger.debug(f"skipping download, file already exists locally: {download_path}")
             self.update_file_data(
                 file_data=file_data,
                 file_data_path=Path(file_data_path),
@@ -185,3 +186,17 @@
         if extras:
             hashable_string += "".join(extras)
         return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
+
+    @property
+    def cache_dir(self) -> Path:
+        return self.process.download_config.download_dir
+
+    def delete_cache(self):
+        if (
+            self.context.iter_delete
+            and not self.context.preserve_downloads
+            and self.cache_dir.exists()
+        ):
+            cache_dir = self.cache_dir
+            logger.info(f"deleting {self.identifier} cache dir {cache_dir}")
+            shutil.rmtree(cache_dir)

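Unlike the base PipelineStep.delete_cache above, the DownloadStep override points cache_dir at the connector's download directory and also honors preserve_downloads, so iterative deletion never discards files the user asked to keep. The extra guard reduces to the following sketch (context stands in for the ProcessorConfig shown earlier):

    # Sketch of the extra condition the DownloadStep override applies.
    def should_delete_download_cache(context) -> bool:
        return context.iter_delete and not context.preserve_downloads
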
unstructured_ingest/v2/pipeline/steps/embed.py CHANGED
@@ -29,7 +29,7 @@ class EmbedStep(PipelineStep):

     def __post_init__(self):
         config = self.process.config.json() if self.process.config else None
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")

     def should_embed(self, filepath: Path, file_data: FileData) -> bool:
         if self.context.reprocess or file_data.reprocess:
@@ -44,7 +44,7 @@ class EmbedStep(PipelineStep):

     def _save_output(self, output_filepath: str, embedded_content: list[dict]):
         with open(str(output_filepath), "w") as f:
-            logger.debug(f"
+            logger.debug(f"writing embedded output to: {output_filepath}")
             json.dump(embedded_content, f, indent=2)

     async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse:
@@ -52,7 +52,7 @@ class EmbedStep(PipelineStep):
         file_data = FileData.from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=path)
         if not self.should_embed(filepath=output_filepath, file_data=file_data):
-            logger.debug(f"
+            logger.debug(f"skipping embedding, output already exists: {output_filepath}")
             return EmbedStepResponse(file_data_path=file_data_path, path=str(output_filepath))
         fn_kwargs = {"elements_filepath": path}
         if not asyncio.iscoroutinefunction(fn):

unstructured_ingest/v2/pipeline/steps/filter.py CHANGED
@@ -17,7 +17,7 @@ class FilterStep(PipelineStep):

     def __post_init__(self):
         config = self.process.config.json() if self.process.config else None
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")

     async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
         file_data = FileData.from_file(path=file_data_path)

unstructured_ingest/v2/pipeline/steps/index.py CHANGED
@@ -28,14 +28,14 @@ class IndexStep(PipelineStep):
             self.process.connection_config.json() if self.process.connection_config else None
         )
         logger.info(
-            f"
+            f"created {self.identifier} with configs: {config}, "
             f"connection configs: {connection_config}"
         )

     @instrument(span_name=STEP_ID)
     def run(self) -> Generator[str, None, None]:
         for file_data in self.process.run():
-            logger.debug(f"
+            logger.debug(f"generated file data: {file_data.to_dict()}")
             try:
                 record_hash = self.get_hash(extras=[file_data.identifier])
                 filename = f"{record_hash}.json"

unstructured_ingest/v2/pipeline/steps/partition.py CHANGED
@@ -29,7 +29,7 @@ class PartitionStep(PipelineStep):

     def __post_init__(self):
         config = self.process.config.json()
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")

     def should_partition(self, filepath: Path, file_data: FileData) -> bool:
         if self.context.reprocess or file_data.reprocess:
@@ -44,7 +44,7 @@ class PartitionStep(PipelineStep):

     def _save_output(self, output_filepath: str, partitioned_content: list[dict]):
         with open(str(output_filepath), "w") as f:
-            logger.debug(f"
+            logger.debug(f"writing partitioned output to: {output_filepath}")
             json.dump(partitioned_content, f, indent=2)

     async def _run_async(
@@ -54,7 +54,7 @@ class PartitionStep(PipelineStep):
         file_data = FileData.from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=Path(file_data_path))
         if not self.should_partition(filepath=output_filepath, file_data=file_data):
-            logger.debug(f"
+            logger.debug(f"skipping partitioning, output already exists: {output_filepath}")
             return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
         fn_kwargs = {"filename": path, "metadata": file_data.metadata.to_dict()}
         if not asyncio.iscoroutinefunction(fn):

unstructured_ingest/v2/pipeline/steps/stage.py CHANGED
@@ -31,7 +31,7 @@ class UploadStageStep(PipelineStep):
             self.process.upload_stager_config.json() if self.process.upload_stager_config else None
         )
         self.cache_dir.mkdir(parents=True, exist_ok=True)
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")

     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str

unstructured_ingest/v2/pipeline/steps/uncompress.py CHANGED
@@ -23,7 +23,7 @@ class UncompressStep(PipelineStep):

     def __post_init__(self):
         config = self.process.config.json() if self.process.config else None
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")

     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str

unstructured_ingest/v2/processes/connectors/__init__.py CHANGED
@@ -6,6 +6,8 @@ from unstructured_ingest.v2.processes.connector_registry import (
     add_source_entry,
 )

+from .airtable import CONNECTOR_TYPE as AIRTABLE_CONNECTOR_TYPE
+from .airtable import airtable_source_entry
 from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE
 from .astradb import astra_db_destination_entry
 from .azure_cognitive_search import CONNECTOR_TYPE as AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE
@@ -92,3 +94,4 @@ add_destination_entry(
 )

 add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)
+add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entry)
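These two hunks register the new Airtable source (airtable.py, +235 lines in the file list above) with the connector registry, following the same pattern as the existing entries. A hedged, self-contained sketch of what such a registry amounts to; the real connector_registry module is not shown in this diff, so the helper and values below are illustrative only:

    # Illustrative stand-in for the connector registry, keyed by CONNECTOR_TYPE string.
    source_registry: dict[str, object] = {}

    def add_source_entry(source_type: str, entry: object) -> None:
        # Register a source connector's entry under its connector-type key.
        source_registry[source_type] = entry

    # After the import-time call in __init__.py, the Airtable entry can be looked up
    # by downstream factory/CLI code. "airtable" and the dict value are stand-ins here.
    add_source_entry(source_type="airtable", entry={"source": "airtable_source_entry"})
    assert "airtable" in source_registry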