unstructured-ingest 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries and is provided for informational purposes only.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (38)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/v2/interfaces/processor.py +6 -1
  3. unstructured_ingest/v2/interfaces/uploader.py +9 -4
  4. unstructured_ingest/v2/otel.py +111 -0
  5. unstructured_ingest/v2/pipeline/interfaces.py +61 -28
  6. unstructured_ingest/v2/pipeline/otel.py +32 -0
  7. unstructured_ingest/v2/pipeline/pipeline.py +11 -7
  8. unstructured_ingest/v2/pipeline/steps/index.py +2 -0
  9. unstructured_ingest/v2/pipeline/steps/upload.py +7 -19
  10. unstructured_ingest/v2/processes/connectors/astradb.py +3 -8
  11. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +4 -9
  12. unstructured_ingest/v2/processes/connectors/chroma.py +3 -8
  13. unstructured_ingest/v2/processes/connectors/couchbase.py +5 -9
  14. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +9 -10
  15. unstructured_ingest/v2/processes/connectors/elasticsearch.py +4 -7
  16. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +3 -3
  17. unstructured_ingest/v2/processes/connectors/fsspec/box.py +3 -3
  18. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +3 -3
  19. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -6
  20. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +3 -3
  21. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +2 -3
  22. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +3 -3
  23. unstructured_ingest/v2/processes/connectors/kdbai.py +7 -8
  24. unstructured_ingest/v2/processes/connectors/local.py +15 -22
  25. unstructured_ingest/v2/processes/connectors/milvus.py +32 -27
  26. unstructured_ingest/v2/processes/connectors/mongodb.py +3 -8
  27. unstructured_ingest/v2/processes/connectors/pinecone.py +6 -24
  28. unstructured_ingest/v2/processes/connectors/singlestore.py +6 -6
  29. unstructured_ingest/v2/processes/connectors/sql.py +5 -7
  30. unstructured_ingest/v2/processes/connectors/weaviate.py +4 -11
  31. unstructured_ingest/v2/processes/partitioner.py +8 -1
  32. {unstructured_ingest-0.0.6.dist-info → unstructured_ingest-0.0.8.dist-info}/METADATA +262 -198
  33. {unstructured_ingest-0.0.6.dist-info → unstructured_ingest-0.0.8.dist-info}/RECORD +37 -36
  34. unstructured_ingest/v2/example.py +0 -37
  35. {unstructured_ingest-0.0.6.dist-info → unstructured_ingest-0.0.8.dist-info}/LICENSE.md +0 -0
  36. {unstructured_ingest-0.0.6.dist-info → unstructured_ingest-0.0.8.dist-info}/WHEEL +0 -0
  37. {unstructured_ingest-0.0.6.dist-info → unstructured_ingest-0.0.8.dist-info}/entry_points.txt +0 -0
  38. {unstructured_ingest-0.0.6.dist-info → unstructured_ingest-0.0.8.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
- __version__ = "0.0.6" # pragma: no cover
+ __version__ = "0.0.8" # pragma: no cover
@@ -27,9 +27,14 @@ class ProcessorConfig(BaseModel):
      re_download: bool = False
      uncompress: bool = False

+     # OTEL support
+     otel_endpoint: Optional[str] = Field(
+         default=None, description="OTEL endpoint to publish trace data to"
+     )
+
      # Used to keep track of state in pipeline
      status: dict = Field(default_factory=dict)
-     semaphore: Optional[Semaphore] = Field(init=False, default=None)
+     semaphore: Optional[Semaphore] = Field(init=False, default=None, exclude=True)

      def model_post_init(self, __context: Any) -> None:
          if self.max_connections is not None:
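
The new `otel_endpoint` field is the only configuration a caller needs for trace export. A minimal sketch of setting it, assuming the remaining `ProcessorConfig` fields keep their defaults (the endpoint value is hypothetical; per `OtelHandler.get_otel_endpoint` later in this diff, the standard `OTEL_EXPORTER_OTLP_ENDPOINT` / `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` environment variables serve as fallbacks when it is left unset):

from unstructured_ingest.v2.interfaces.processor import ProcessorConfig

# Hypothetical collector address; any OTLP-compatible gRPC endpoint should work.
context = ProcessorConfig(otel_endpoint="http://localhost:4317")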
@@ -1,4 +1,4 @@
- from abc import ABC, abstractmethod
+ from abc import ABC
  from dataclasses import dataclass
  from pathlib import Path
  from typing import Any, TypeVar
@@ -31,9 +31,14 @@ class Uploader(BaseProcess, BaseConnector, ABC):
      def is_async(self) -> bool:
          return False

-     @abstractmethod
-     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-         pass
+     def is_batch(self) -> bool:
+         return False
+
+     def run_batch(self, contents: list[UploadContent], **kwargs: Any) -> None:
+         raise NotImplementedError()
+
+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+         raise NotImplementedError()

      async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
          return self.run(contents=[UploadContent(path=path, file_data=file_data)], **kwargs)
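
For orientation, a rough sketch of a destination against the reworked interface: `run` now receives a single staged file plus its `FileData` instead of a list of `UploadContent`, and only connectors that want the whole batch override `is_batch`/`run_batch`. This class is illustrative only; it is not part of the package and omits the connection/upload config wiring real connectors carry:

import json
from pathlib import Path
from typing import Any

from unstructured_ingest.v2.interfaces import FileData, Uploader


class LoggingUploader(Uploader):
    # Hypothetical destination: handles one staged JSON file per call.
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        with path.open() as f:
            elements = json.load(f)
        print(f"would upload {len(elements)} elements from {path.name}")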
@@ -0,0 +1,111 @@
+ import os
+ from dataclasses import dataclass, field
+ from typing import Callable, ClassVar, Optional, Protocol, Sequence
+
+ from opentelemetry import trace
+ from opentelemetry.context import attach, get_current
+ from opentelemetry.propagate import extract, inject
+ from opentelemetry.sdk.resources import SERVICE_NAME, Resource
+ from opentelemetry.sdk.trace import ReadableSpan, Tracer, TracerProvider
+ from opentelemetry.sdk.trace.export import (
+     ConsoleSpanExporter,
+     SimpleSpanProcessor,
+     SpanExportResult,
+ )
+
+ from unstructured_ingest.v2.logger import logger
+
+
+ class AddTraceCallable(Protocol):
+     def __call__(self, provider: TracerProvider) -> None:
+         pass
+
+
+ class LogSpanExporter(ConsoleSpanExporter):
+     def __init__(self, log_out: Callable = logger.info, **kwargs):
+         self.log_out = log_out
+         super().__init__(**kwargs)
+
+     def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
+         for span in spans:
+             self.log_out(self.formatter(span))
+         return SpanExportResult.SUCCESS
+
+
+ @dataclass
+ class OtelHandler:
+     otel_endpoint: Optional[str] = None
+     service_name: str = "unstructured-ingest"
+     trace_provider: TracerProvider = field(init=False)
+     log_out: Callable = field(default=logger.info)
+     trace_context_key: ClassVar[str] = "_trace_context"
+
+     def init_trace(self):
+         # Should only be done once
+         resource = Resource(attributes={SERVICE_NAME: self.service_name})
+         trace_provider = self.init_trace_provider(resource=resource)
+         trace.set_tracer_provider(trace_provider)
+
+     @staticmethod
+     def set_attributes(span, attributes_dict):
+         if attributes_dict:
+             for att in attributes_dict:
+                 span.set_attribute(att, attributes_dict[att])
+
+     @staticmethod
+     def inject_context() -> dict:
+         trace_context = {}
+         current_context = get_current()
+         inject(trace_context, current_context)
+         return trace_context
+
+     @staticmethod
+     def attach_context(trace_context: dict) -> object:
+         extracted_context = extract(trace_context)
+         return attach(extracted_context)
+
+     def get_otel_endpoint(self) -> Optional[str]:
+         if otel_endpoint := self.otel_endpoint:
+             return otel_endpoint
+         if otlp_endpoint := os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT"):
+             return otlp_endpoint
+         if otlp_traces_endpoint := os.getenv("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"):
+             return otlp_traces_endpoint
+         return None
+
+     def _add_console_trace_processor(self, provider: TracerProvider) -> None:
+         def custom_formatter(span: ReadableSpan) -> str:
+             duration = (span.end_time - span.start_time) / 1e9
+             s = f"{span.name} finished in {duration}s"
+             if span.attributes:
+                 attributes_str = ", ".join([f"{k}={v}" for k, v in span.attributes.items()])
+                 s += f", attributes: {attributes_str}"
+             return s
+
+         tracer_exporter = LogSpanExporter(formatter=custom_formatter, log_out=self.log_out)
+         processor = SimpleSpanProcessor(tracer_exporter)
+         provider.add_span_processor(span_processor=processor)
+
+     def _add_otel_trace_processor(self, provider: TracerProvider) -> None:
+         otel_endpoint = self.get_otel_endpoint()
+         if not otel_endpoint:
+             return None
+         from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+
+         logger.debug(f"Adding otel exported at {otel_endpoint}")
+         trace_exporter = OTLPSpanExporter()
+         processor = SimpleSpanProcessor(trace_exporter)
+         provider.add_span_processor(processor)
+
+     def init_trace_provider(self, resource: Resource) -> TracerProvider:
+         trace_provider = TracerProvider(resource=resource)
+         add_fns: list[AddTraceCallable] = [
+             self._add_otel_trace_processor,
+             self._add_console_trace_processor,
+         ]
+         for add_fn in add_fns:
+             add_fn(provider=trace_provider)
+         return trace_provider
+
+     def get_tracer(self) -> Tracer:
+         return trace.get_tracer(self.service_name)
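
Taken together, `OtelHandler` wraps the usual OpenTelemetry setup: build a `TracerProvider`, attach the log-based exporter plus an optional OTLP exporter, and hand out tracers. A minimal sketch of exercising it directly (the endpoint is an assumed local OTLP gRPC collector; without one configured, spans only reach the log exporter):

from unstructured_ingest.v2.otel import OtelHandler

handler = OtelHandler(otel_endpoint="http://localhost:4317")  # assumed collector address
handler.init_trace()  # register the tracer provider once per process

tracer = handler.get_tracer()
with tracer.start_as_current_span("example work", record_exception=True) as span:
    OtelHandler.set_attributes(span, {"file_id": "example-doc"})
    # ... traced work goes here ...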
@@ -1,40 +1,24 @@
  import asyncio
  import logging
  import multiprocessing as mp
- from abc import ABC
+ from abc import ABC, abstractmethod
  from concurrent.futures import ThreadPoolExecutor
  from dataclasses import dataclass
- from functools import wraps
  from pathlib import Path
- from time import time
  from typing import Any, Awaitable, Callable, Optional, TypeVar

  from tqdm import tqdm
  from tqdm.asyncio import tqdm as tqdm_asyncio

- from unstructured_ingest.v2.interfaces import BaseProcess, ProcessorConfig
+ from unstructured_ingest.v2.interfaces import BaseProcess, ProcessorConfig, Uploader
  from unstructured_ingest.v2.logger import logger, make_default_logger
+ from unstructured_ingest.v2.otel import OtelHandler
+ from unstructured_ingest.v2.pipeline.otel import instrument

  BaseProcessT = TypeVar("BaseProcessT", bound=BaseProcess)
  iterable_input = list[dict[str, Any]]


- def timed(func):
-     @wraps(func)
-     def time_it(self, *args, **kwargs):
-         start = time()
-         try:
-             return func(self, *args, **kwargs)
-         finally:
-             if func.__name__ == "__call__":
-                 reported_name = f"{self.__class__.__name__} [cls]"
-             else:
-                 reported_name = func.__name__
-             logger.info(f"{reported_name} took {time() - start} seconds")
-
-     return time_it
-
-
  @dataclass
  class PipelineStep(ABC):
      process: BaseProcessT
@@ -97,9 +81,15 @@ class PipelineStep(ABC):
              return self.process_serially(iterable)
          with mp.Pool(
              processes=self.context.num_processes,
-             initializer=self._init_logger,
-             initargs=(logging.DEBUG if self.context.verbose else logging.INFO,),
+             initializer=self._init_mp,
+             initargs=(
+                 logging.DEBUG if self.context.verbose else logging.INFO,
+                 self.context.otel_endpoint,
+             ),
          ) as pool:
+             otel_context = OtelHandler.inject_context()
+             for iter in iterable:
+                 iter[OtelHandler.trace_context_key] = otel_context
              if self.context.tqdm:
                  return list(
                      tqdm(
@@ -115,11 +105,13 @@
          # Allow mapping of kwargs via multiprocessing map()
          return self.run(**input_kwargs)

-     def _init_logger(self, log_level: int):
+     def _init_mp(self, log_level: int, endpoint: Optional[str] = None) -> None:
          # Init logger for each spawned process when using multiprocessing pool
          make_default_logger(level=log_level)
+         otel_handler = OtelHandler(otel_endpoint=endpoint, log_out=logger.debug)
+         otel_handler.init_trace()

-     @timed
+     @instrument()
      def __call__(self, iterable: Optional[iterable_input] = None) -> Any:
          iterable = iterable or []
          if iterable:
@@ -141,9 +133,19 @@
          raise NotImplementedError

      def run(self, _fn: Optional[Callable] = None, **kwargs: Any) -> Optional[Any]:
+         kwargs = kwargs.copy()
+         otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.debug)
+         tracer = otel_handler.get_tracer()
+         if trace_context := kwargs.pop(otel_handler.trace_context_key, {}):
+             otel_handler.attach_context(trace_context=trace_context)
+         attributes = {}
+         if file_data_path := kwargs.get("file_data_path"):
+             attributes["file_id"] = Path(file_data_path).stem
          try:
-             fn = _fn or self.process.run
-             return self._run(fn=fn, **kwargs)
+             with tracer.start_as_current_span(self.identifier, record_exception=True) as span:
+                 otel_handler.set_attributes(span, attributes)
+                 fn = _fn or self.process.run
+                 return self._run(fn=fn, **kwargs)
          except Exception as e:
              logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
              if "file_data_path" in kwargs:
@@ -153,9 +155,17 @@
              return None

      async def run_async(self, _fn: Optional[Callable] = None, **kwargs: Any) -> Optional[Any]:
+         otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.debug)
          try:
-             fn = _fn or self.process.run_async
-             return await self._run_async(fn=fn, **kwargs)
+             attributes = {}
+             if file_data_path := kwargs.get("file_data_path"):
+                 attributes["file_id"] = Path(file_data_path).stem
+             with otel_handler.get_tracer().start_as_current_span(
+                 self.identifier, record_exception=True
+             ) as span:
+                 otel_handler.set_attributes(span, attributes)
+                 fn = _fn or self.process.run_async
+                 return await self._run_async(fn=fn, **kwargs)
          except Exception as e:
              logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
              if "file_data_path" in kwargs:
@@ -167,3 +177,26 @@
      @property
      def cache_dir(self) -> Path:
          return Path(self.context.work_dir) / self.identifier
+
+
+ @dataclass
+ class BatchPipelineStep(PipelineStep, ABC):
+     process: Uploader
+
+     def __call__(self, iterable: Optional[iterable_input] = None) -> Any:
+         if self.context.mp_supported and self.process.is_batch():
+             return self.run_batch(contents=iterable)
+         super().__call__(iterable=iterable)
+
+     @abstractmethod
+     def _run_batch(self, contents: iterable_input, **kwargs) -> Any:
+         pass
+
+     def run_batch(self, contents: iterable_input, **kwargs) -> Any:
+         try:
+             return self._run_batch(contents=contents, **kwargs)
+         except Exception as e:
+             self.context.status[self.identifier] = {"step_error": str(e)}
+             if self.context.raise_on_error:
+                 raise e
+             return None
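
An uploader opts into this batch path by returning True from `is_batch()` and implementing `run_batch`; otherwise `BatchPipelineStep` falls back to the per-file behaviour inherited from `PipelineStep`. A hypothetical sketch of such a connector (not one shipped in the package, and again without the config plumbing):

import json
from typing import Any

from unstructured_ingest.v2.interfaces import Uploader
from unstructured_ingest.v2.interfaces.uploader import UploadContent


class BulkLoggingUploader(Uploader):
    # Hypothetical destination that wants every staged file in one call.
    def is_batch(self) -> bool:
        return True

    def run_batch(self, contents: list[UploadContent], **kwargs: Any) -> None:
        total = 0
        for content in contents:
            with content.path.open() as f:
                total += len(json.load(f))
        print(f"would upload {total} elements from {len(contents)} files")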
@@ -0,0 +1,32 @@
+ from functools import wraps
+ from typing import Callable, Optional
+
+ from unstructured_ingest.v2.logger import logger
+ from unstructured_ingest.v2.otel import OtelHandler
+
+
+ def instrument(
+     span_name: Optional[str] = None,
+     record_exception: bool = True,
+     attributes: dict[str, str] = None,
+     log_out: Callable = logger.info,
+ ) -> Callable[[Callable], Callable]:
+     def span_decorator(func: Callable) -> Callable:
+         def get_name(self) -> str:
+             if span_name:
+                 return span_name
+             return f"{self.identifier} step"
+
+         @wraps(func)
+         def wrap_with_span(self, *args, **kwargs):
+             name = get_name(self=self)
+             otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=log_out)
+             with otel_handler.get_tracer().start_as_current_span(
+                 name, record_exception=record_exception
+             ) as span:
+                 otel_handler.set_attributes(span, attributes)
+                 return func(self, *args, **kwargs)
+
+         return wrap_with_span
+
+     return span_decorator
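
The decorator assumes it wraps a method on a step-like object: it reads `self.context.otel_endpoint` for the handler and falls back to `f"{self.identifier} step"` as the span name when `span_name` is omitted, which is how the index and upload steps below use it via `@instrument(span_name=STEP_ID)`. A hedged sketch of applying it outside the built-in steps (the class and attribute values are made up):

from unstructured_ingest.v2.interfaces.processor import ProcessorConfig
from unstructured_ingest.v2.pipeline.otel import instrument


class ExampleStep:
    identifier = "example"  # becomes the span name ("example step") when span_name is omitted

    def __init__(self, context: ProcessorConfig):
        self.context = context  # instrument() reads context.otel_endpoint from here

    @instrument(attributes={"stage": "demo"})
    def __call__(self) -> None:
        ...  # runs inside the span

Each call on an ExampleStep instance then produces one span, exported according to how the tracer provider was initialized.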
@@ -1,11 +1,11 @@
  import logging
  import multiprocessing as mp
  from dataclasses import InitVar, dataclass, field
- from time import time
  from typing import Any, Optional, Union

- from unstructured_ingest.v2.interfaces import ProcessorConfig
+ from unstructured_ingest.v2.interfaces import ProcessorConfig, Uploader
  from unstructured_ingest.v2.logger import logger, make_default_logger
+ from unstructured_ingest.v2.otel import OtelHandler
  from unstructured_ingest.v2.pipeline.steps.chunk import Chunker, ChunkStep
  from unstructured_ingest.v2.pipeline.steps.download import DownloaderT, DownloadStep
  from unstructured_ingest.v2.pipeline.steps.embed import Embedder, EmbedStep
@@ -14,7 +14,7 @@ from unstructured_ingest.v2.pipeline.steps.index import IndexerT, IndexStep
  from unstructured_ingest.v2.pipeline.steps.partition import Partitioner, PartitionStep
  from unstructured_ingest.v2.pipeline.steps.stage import UploadStager, UploadStageStep
  from unstructured_ingest.v2.pipeline.steps.uncompress import Uncompressor, UncompressStep
- from unstructured_ingest.v2.pipeline.steps.upload import Uploader, UploadStep
+ from unstructured_ingest.v2.pipeline.steps.upload import UploadStep
  from unstructured_ingest.v2.processes.chunker import ChunkerConfig
  from unstructured_ingest.v2.processes.connector_registry import (
      ConnectionConfig,
@@ -77,6 +77,8 @@ class Pipeline:
          filterer: Filterer = None,
      ):
          make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
+         otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint)
+         otel_handler.init_trace()
          self.indexer_step = IndexStep(process=indexer, context=self.context)
          self.downloader_step = DownloadStep(process=downloader, context=self.context)
          self.filter_step = FilterStep(process=filterer, context=self.context) if filterer else None
@@ -121,11 +123,13 @@ class Pipeline:
                  logger.error(f"{k}: [{kk}] {vv}")

      def run(self):
+         otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.info)
          try:
-             start_time = time()
-             self._run_prechecks()
-             self._run()
-             logger.info(f"Finished ingest process in {time() - start_time}s")
+             with otel_handler.get_tracer().start_as_current_span(
+                 "ingest process", record_exception=True
+             ):
+                 self._run_prechecks()
+                 self._run()
          finally:
              self.log_statuses()
              self.cleanup()
@@ -6,6 +6,7 @@ from typing import Generator, Optional, TypeVar
  from unstructured_ingest.v2.interfaces.indexer import Indexer
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
+ from unstructured_ingest.v2.pipeline.otel import instrument
  from unstructured_ingest.v2.utils import serialize_base_model_json

  IndexerT = TypeVar("IndexerT", bound=Indexer)
@@ -31,6 +32,7 @@ class IndexStep(PipelineStep):
              f"connection configs: {connection_config}"
          )

+     @instrument(span_name=STEP_ID)
      def run(self) -> Generator[str, None, None]:
          for file_data in self.process.run():
              logger.debug(f"Generated file data: {file_data.to_dict()}")
@@ -4,9 +4,10 @@ from pathlib import Path
  from typing import Callable, Optional, TypedDict

  from unstructured_ingest.v2.interfaces import FileData
- from unstructured_ingest.v2.interfaces.uploader import UploadContent, Uploader
+ from unstructured_ingest.v2.interfaces.uploader import UploadContent
  from unstructured_ingest.v2.logger import logger
- from unstructured_ingest.v2.pipeline.interfaces import PipelineStep, iterable_input, timed
+ from unstructured_ingest.v2.pipeline.interfaces import BatchPipelineStep
+ from unstructured_ingest.v2.pipeline.otel import instrument

  STEP_ID = "upload"

@@ -17,8 +18,7 @@ class UploadStepContent(TypedDict):


  @dataclass
- class UploadStep(PipelineStep):
-     process: Uploader
+ class UploadStep(BatchPipelineStep):
      identifier: str = STEP_ID

      def __str__(self):
@@ -34,25 +34,13 @@ class UploadStep(PipelineStep):
              f"connection configs: {connection_config}"
          )

-     def process_whole(self, iterable: iterable_input):
-         self.run(contents=iterable)
-
-     @timed
-     def __call__(self, iterable: iterable_input):
-         logger.info(
-             f"Calling {self.__class__.__name__} " f"with {len(iterable)} docs",  # type: ignore
-         )
-         if self.process.is_async():
-             self.process_async(iterable=iterable)
-         else:
-             self.process_whole(iterable=iterable)
-
-     def _run(self, fn: Callable, contents: list[UploadStepContent]):
+     @instrument(span_name=STEP_ID)
+     def _run_batch(self, contents: list[UploadStepContent]) -> None:
          upload_contents = [
              UploadContent(path=Path(c["path"]), file_data=FileData.from_file(c["file_data_path"]))
              for c in contents
          ]
-         fn(contents=upload_contents)
+         self.process.run_batch(contents=upload_contents)

      async def _run_async(self, path: str, file_data_path: str, fn: Optional[Callable] = None):
          fn = fn or self.process.run_async
@@ -14,7 +14,6 @@ from unstructured_ingest.v2.interfaces import (
      AccessConfig,
      ConnectionConfig,
      FileData,
-     UploadContent,
      Uploader,
      UploaderConfig,
      UploadStager,
@@ -139,13 +138,9 @@ class AstraDBUploader(Uploader):
          )
          return astra_db_collection

-     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-         elements_dict = []
-         for content in contents:
-             with open(content.path) as elements_file:
-                 elements = json.load(elements_file)
-             elements_dict.extend(elements)
-
+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+         with path.open("r") as file:
+             elements_dict = json.load(file)
          logger.info(
              f"writing {len(elements_dict)} objects to destination "
              f"collection {self.upload_config.collection_name}"
@@ -12,7 +12,7 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.interfaces import (
      AccessConfig,
      ConnectionConfig,
-     UploadContent,
+     FileData,
      Uploader,
      UploaderConfig,
      UploadStager,
@@ -192,14 +192,9 @@ class AzureCognitiveSearchUploader(Uploader):
      def write_dict_wrapper(self, elements_dict):
          return self.write_dict(elements_dict=elements_dict)

-     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-
-         elements_dict = []
-         for content in contents:
-             with open(content.path) as elements_file:
-                 elements = json.load(elements_file)
-             elements_dict.extend(elements)
-
+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+         with path.open("r") as file:
+             elements_dict = json.load(file)
          logger.info(
              f"writing document batches to destination"
              f" endpoint at {str(self.connection_config.endpoint)}"
@@ -15,7 +15,6 @@ from unstructured_ingest.v2.interfaces import (
      AccessConfig,
      ConnectionConfig,
      FileData,
-     UploadContent,
      Uploader,
      UploaderConfig,
      UploadStager,
@@ -186,13 +185,9 @@ class ChromaUploader(Uploader):
          )
          return chroma_dict

-     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-
-         elements_dict = []
-         for content in contents:
-             with open(content.path) as elements_file:
-                 elements = json.load(elements_file)
-             elements_dict.extend(elements)
+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+         with path.open("r") as file:
+             elements_dict = json.load(file)

          logger.info(
              f"writing {len(elements_dict)} objects to destination "
@@ -26,7 +26,6 @@ from unstructured_ingest.v2.interfaces import (
      FileDataSourceMetadata,
      Indexer,
      IndexerConfig,
-     UploadContent,
      Uploader,
      UploaderConfig,
      UploadStager,
@@ -134,14 +133,11 @@ class CouchbaseUploader(Uploader):
              logger.error(f"Failed to validate connection {e}", exc_info=True)
              raise DestinationConnectionError(f"failed to validate connection: {e}")

-     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-         elements = []
-         for content in contents:
-             with open(content.path) as elements_file:
-                 elements.extend(json.load(elements_file))
-
+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+         with path.open("r") as file:
+             elements_dict = json.load(file)
          logger.info(
-             f"writing {len(elements)} objects to destination "
+             f"writing {len(elements_dict)} objects to destination "
              f"bucket, {self.connection_config.bucket} "
              f"at {self.connection_config.connection_string}",
          )
@@ -150,7 +146,7 @@
          scope = bucket.scope(self.connection_config.scope)
          collection = scope.collection(self.connection_config.collection)

-         for chunk in batch_generator(elements, self.upload_config.batch_size):
+         for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
              collection.upsert_multi({doc_id: doc for doc in chunk for doc_id, doc in doc.items()})

@@ -1,5 +1,6 @@
  import os
  from dataclasses import dataclass
+ from pathlib import Path
  from typing import TYPE_CHECKING, Any, Optional

  from pydantic import Field, Secret
@@ -9,7 +10,7 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.interfaces import (
      AccessConfig,
      ConnectionConfig,
-     UploadContent,
+     FileData,
      Uploader,
      UploaderConfig,
  )
@@ -142,15 +143,13 @@ class DatabricksVolumesUploader(Uploader):
              logger.error(f"failed to validate connection: {e}", exc_info=True)
              raise DestinationConnectionError(f"failed to validate connection: {e}")

-     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-         for content in contents:
-             with open(content.path, "rb") as elements_file:
-                 output_path = os.path.join(self.upload_config.path, content.path.name)
-                 self.get_client().files.upload(
-                     file_path=output_path,
-                     contents=elements_file,
-                     overwrite=self.upload_config.overwrite,
-                 )
+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+         output_path = os.path.join(self.upload_config.path, path.name)
+         self.get_client().files.upload(
+             file_path=output_path,
+             contents=path,
+             overwrite=self.upload_config.overwrite,
+         )


  databricks_volumes_destination_entry = DestinationRegistryEntry(
@@ -26,7 +26,6 @@ from unstructured_ingest.v2.interfaces import (
      FileDataSourceMetadata,
      Indexer,
      IndexerConfig,
-     UploadContent,
      Uploader,
      UploaderConfig,
      UploadStager,
@@ -384,14 +383,12 @@ class ElasticsearchUploader(Uploader):

          return parallel_bulk

-     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
          parallel_bulk = self.load_parallel_bulk()
-         elements_dict = []
-         for content in contents:
-             with open(content.path) as elements_file:
-                 elements = json.load(elements_file)
-             elements_dict.extend(elements)
+         with path.open("r") as file:
+             elements_dict = json.load(file)
          upload_destination = self.connection_config.hosts or self.connection_config.cloud_id
+
          logger.info(
              f"writing {len(elements_dict)} elements via document batches to destination "
              f"index named {self.upload_config.index_name} at {upload_destination} with "
@@ -7,7 +7,7 @@ from typing import Any, Generator, Optional
  from pydantic import Field, Secret

  from unstructured_ingest.utils.dep_check import requires_dependencies
- from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
+ from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
  from unstructured_ingest.v2.processes.connector_registry import (
      DestinationRegistryEntry,
      SourceRegistryEntry,
@@ -152,8 +152,8 @@ class AzureUploader(FsspecUploader):
          super().precheck()

      @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-         return super().run(contents=contents, **kwargs)
+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+         return super().run(path=path, file_data=file_data, **kwargs)

      @requires_dependencies(["adlfs", "fsspec"], extras="azure")
      async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None: