unstructured-ingest 0.0.14__py3-none-any.whl → 0.0.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (70)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/interfaces.py +1 -1
  3. unstructured_ingest/cli/utils.py +1 -1
  4. unstructured_ingest/connector/astradb.py +1 -1
  5. unstructured_ingest/connector/biomed.py +4 -4
  6. unstructured_ingest/connector/chroma.py +1 -1
  7. unstructured_ingest/connector/databricks_volumes.py +2 -2
  8. unstructured_ingest/connector/fsspec/box.py +1 -1
  9. unstructured_ingest/connector/fsspec/fsspec.py +5 -5
  10. unstructured_ingest/connector/git.py +1 -1
  11. unstructured_ingest/connector/google_drive.py +4 -4
  12. unstructured_ingest/connector/hubspot.py +1 -1
  13. unstructured_ingest/connector/kafka.py +8 -8
  14. unstructured_ingest/connector/local.py +1 -1
  15. unstructured_ingest/connector/notion/helpers.py +4 -4
  16. unstructured_ingest/connector/onedrive.py +3 -3
  17. unstructured_ingest/connector/outlook.py +2 -2
  18. unstructured_ingest/connector/pinecone.py +1 -1
  19. unstructured_ingest/connector/sharepoint.py +8 -8
  20. unstructured_ingest/connector/vectara.py +6 -6
  21. unstructured_ingest/interfaces.py +4 -4
  22. unstructured_ingest/logger.py +1 -1
  23. unstructured_ingest/pipeline/copy.py +1 -1
  24. unstructured_ingest/pipeline/interfaces.py +2 -2
  25. unstructured_ingest/pipeline/partition.py +1 -1
  26. unstructured_ingest/pipeline/pipeline.py +1 -1
  27. unstructured_ingest/pipeline/reformat/chunking.py +2 -2
  28. unstructured_ingest/pipeline/reformat/embedding.py +1 -1
  29. unstructured_ingest/pipeline/source.py +2 -2
  30. unstructured_ingest/utils/compression.py +3 -3
  31. unstructured_ingest/utils/string_and_date_utils.py +2 -2
  32. unstructured_ingest/v2/cli/base/cmd.py +3 -3
  33. unstructured_ingest/v2/cli/base/dest.py +1 -1
  34. unstructured_ingest/v2/cli/base/src.py +1 -1
  35. unstructured_ingest/v2/cli/utils/click.py +1 -1
  36. unstructured_ingest/v2/interfaces/processor.py +48 -13
  37. unstructured_ingest/v2/logger.py +1 -1
  38. unstructured_ingest/v2/otel.py +1 -1
  39. unstructured_ingest/v2/pipeline/interfaces.py +9 -2
  40. unstructured_ingest/v2/pipeline/pipeline.py +17 -6
  41. unstructured_ingest/v2/pipeline/steps/chunk.py +3 -3
  42. unstructured_ingest/v2/pipeline/steps/download.py +17 -2
  43. unstructured_ingest/v2/pipeline/steps/embed.py +3 -3
  44. unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
  45. unstructured_ingest/v2/pipeline/steps/index.py +2 -2
  46. unstructured_ingest/v2/pipeline/steps/partition.py +3 -3
  47. unstructured_ingest/v2/pipeline/steps/stage.py +1 -1
  48. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
  49. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  50. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  51. unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -1
  52. unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
  53. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -4
  54. unstructured_ingest/v2/processes/connectors/google_drive.py +1 -1
  55. unstructured_ingest/v2/processes/connectors/local.py +6 -5
  56. unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
  57. unstructured_ingest/v2/processes/connectors/onedrive.py +2 -2
  58. unstructured_ingest/v2/processes/connectors/opensearch.py +1 -1
  59. unstructured_ingest/v2/processes/connectors/pinecone.py +2 -2
  60. unstructured_ingest/v2/processes/connectors/sharepoint.py +9 -5
  61. unstructured_ingest/v2/processes/filter.py +1 -1
  62. unstructured_ingest/v2/processes/partitioner.py +3 -3
  63. unstructured_ingest/v2/utils.py +7 -0
  64. {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/METADATA +272 -274
  65. {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/RECORD +69 -69
  66. unstructured_ingest/evaluate.py +0 -338
  67. {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/LICENSE.md +0 -0
  68. {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/WHEEL +0 -0
  69. {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/entry_points.txt +0 -0
  70. {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/top_level.txt +0 -0
@@ -10,13 +10,13 @@ def json_to_dict(json_string: str) -> t.Union[str, t.Dict[str, t.Any]]:
     try:
         return json.loads(json_string)
     except json.JSONDecodeError:
-        # Not neccessary an error if it is a path or malformed json
+        # Not necessary an error if it is a path or malformed json
         pass
     try:
         # This is common when single quotes are used instead of double quotes
         return json.loads(json_string.replace("'", '"'))
     except json.JSONDecodeError:
-        # Not neccessary an error if it is a path
+        # Not necessary an error if it is a path
         pass
     return json_string
 
@@ -102,7 +102,7 @@ class BaseCmd(ABC):
         cmd.params.extend(options)
         return cmd
 
-    def get_pipline(
+    def get_pipeline(
         self,
         src: str,
         source_options: dict[str, Any],
@@ -122,7 +122,7 @@ class BaseCmd(ABC):
             pipeline_kwargs["chunker"] = chunker
         if filterer := self.get_filterer(options=source_options):
             pipeline_kwargs["filterer"] = filterer
-        if embedder := self.get_embeder(options=source_options):
+        if embedder := self.get_embedder(options=source_options):
             pipeline_kwargs["embedder"] = embedder
         if dest:
             logger.debug(
@@ -160,7 +160,7 @@ class BaseCmd(ABC):
         return Filterer(config=filterer_configs)
 
     @staticmethod
-    def get_embeder(options: dict[str, Any]) -> Optional[Embedder]:
+    def get_embedder(options: dict[str, Any]) -> Optional[Embedder]:
        embedder_config = extract_config(flat_data=options, config=EmbedderConfig)
        if not embedder_config.embedding_provider:
            return None
@@ -40,7 +40,7 @@ class DestCmd(BaseCmd):
         source_options: dict = ctx.parent.params if ctx.parent else {}
         conform_click_options(options)
         try:
-            pipeline = self.get_pipline(
+            pipeline = self.get_pipeline(
                 src=source_cmd,
                 source_options=source_options,
                 dest=self.cmd_name,
@@ -55,7 +55,7 @@ class SrcCmd(BaseCmd):
         conform_click_options(options)
         logger.setLevel(logging.DEBUG if options.get("verbose", False) else logging.INFO)
         try:
-            pipeline = self.get_pipline(src=self.cmd_name, source_options=options)
+            pipeline = self.get_pipeline(src=self.cmd_name, source_options=options)
             pipeline.run()
         except Exception as e:
             logger.error(f"failed to run source command {self.cmd_name}: {e}", exc_info=True)
@@ -67,7 +67,7 @@ class FileOrJson(click.ParamType):
             return value
         self.fail(
             gettext(
-                "{value} is not a valid json string nor an existing filepath.",
+                "{value} is neither a valid json string nor an existing filepath.",
             ).format(value=value),
             param,
             ctx,
@@ -11,21 +11,56 @@ DEFAULT_WORK_DIR = str((Path.home() / ".cache" / "unstructured" / "ingest" / "pi
 class ProcessorConfig(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    reprocess: bool = False
-    verbose: bool = False
-    tqdm: bool = False
-    work_dir: str = Field(default_factory=lambda: DEFAULT_WORK_DIR)
-    num_processes: int = 2
-    max_connections: Optional[int] = None
-    raise_on_error: bool = False
+    reprocess: bool = Field(
+        default=False,
+        description="Reprocess a downloaded file even if the relevant structured "
+        "output .json file in output directory already exists.",
+    )
+    verbose: bool = Field(default=False)
+    tqdm: bool = Field(default=False, description="Display tqdm progress bar")
+    work_dir: str = Field(
+        default_factory=lambda: DEFAULT_WORK_DIR,
+        description="Where to place working files when processing each step",
+    )
+    num_processes: int = Field(
+        default=2, description="Number of parallel processes with which to process docs"
+    )
+    max_connections: Optional[int] = Field(
+        default=None, description="Limit of concurrent connectionts"
+    )
+    raise_on_error: bool = Field(
+        default=False,
+        description="Is set, will raise error if any doc in the pipeline fail. "
+        "Otherwise will log error and continue with other docs",
+    )
     disable_parallelism: bool = Field(
-        default_factory=lambda: os.getenv("INGEST_DISABLE_PARALLELISM", "false").lower() == "true"
+        default_factory=lambda: os.getenv("INGEST_DISABLE_PARALLELISM", "false").lower() == "true",
+    )
+    preserve_downloads: bool = Field(
+        default=False, description="Don't delete downloaded files after process completes"
+    )
+    download_only: bool = Field(
+        default=False, description="skip the rest of the process after files are downloaded"
+    )
+    re_download: bool = Field(
+        default=False,
+        description="If set, will re-download downloaded files "
+        "regardless of if they already exist locally",
+    )
+    uncompress: bool = Field(
+        default=False,
+        description="Uncompress any archived files. Currently supporting "
+        "zip and tar files based on file extension.",
+    )
+    iter_delete: bool = Field(
+        default=False,
+        description="If limited on memory, this can be enabled to delete "
+        "cached content as it's used and no longer needed in the pipeline.",
+    )
+    delete_cache: bool = Field(
+        default=False,
+        description="If set, will delete the cache work directory when process finishes",
     )
-    preserve_downloads: bool = False
-    download_only: bool = False
-    max_docs: Optional[int] = None
-    re_download: bool = False
-    uncompress: bool = False
 
     # OTEL support
     otel_endpoint: Optional[str] = Field(
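
The hunk above (unstructured_ingest/v2/interfaces/processor.py in the file list) mostly attaches Field descriptions to existing ProcessorConfig options, but iter_delete and delete_cache are new cache-management switches, while max_docs no longer appears in this hunk. A minimal usage sketch, assuming only what this diff shows: ProcessorConfig is importable from unstructured_ingest.v2.interfaces (the import visible in the pipeline.py hunk further below) and, as a pydantic BaseModel, is constructed with keyword arguments; the specific values here are illustrative, not recommendations.

# Illustrative only: the field names come from the diff above, the values are made up.
from unstructured_ingest.v2.interfaces import ProcessorConfig

context = ProcessorConfig(
    num_processes=4,           # parallel processes used to process docs
    tqdm=True,                 # display a tqdm progress bar
    iter_delete=True,          # new in 0.0.16: delete cached step output once it is no longer needed
    delete_cache=True,         # new in 0.0.16: remove the whole work_dir when the run finishes
    preserve_downloads=False,  # allow downloaded files to be cleaned up as well
)

Fields left unset fall back to the defaults declared in the Field(...) definitions above.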
@@ -101,7 +101,7 @@ class SensitiveFormatter(Formatter):
 
 
 def remove_root_handlers(logger: Logger) -> None:
-    # NOTE(robinson) - in some environments such as Google Colab, there is a root handler
+    # NOTE(robinson): in some environments such as Google Colab, there is a root handler
     # that doesn't not mask secrets, meaning sensitive info such as api keys appear in logs.
     # Removing these when they exist prevents this behavior
     if logger.root.hasHandlers():
@@ -92,7 +92,7 @@ class OtelHandler:
             return None
         from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
 
-        logger.debug(f"Adding otel exported at {otel_endpoint}")
+        logger.debug(f"adding otel exported at {otel_endpoint}")
         trace_exporter = OTLPSpanExporter()
         processor = SimpleSpanProcessor(trace_exporter)
         provider.add_span_processor(processor)
@@ -3,6 +3,7 @@ from __future__ import annotations
 import asyncio
 import logging
 import multiprocessing as mp
+import shutil
 from abc import ABC, abstractmethod
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
@@ -118,10 +119,10 @@ class PipelineStep(ABC):
         iterable = iterable or []
         if iterable:
             logger.info(
-                f"Calling {self.__class__.__name__} " f"with {len(iterable)} docs",  # type: ignore
+                f"calling {self.__class__.__name__} " f"with {len(iterable)} docs",  # type: ignore
             )
         else:
-            logger.info(f"Calling {self.__class__.__name__} with no inputs")
+            logger.info(f"calling {self.__class__.__name__} with no inputs")
         if self.context.async_supported and self.process.is_async():
             return self.process_async(iterable=iterable)
         if self.context.mp_supported:
@@ -180,6 +181,12 @@ class PipelineStep(ABC):
     def cache_dir(self) -> Path:
         return Path(self.context.work_dir) / self.identifier
 
+    def delete_cache(self):
+        if self.context.iter_delete and self.cache_dir.exists():
+            cache_dir = self.cache_dir
+            logger.info(f"deleting {self.identifier} cache dir {cache_dir}")
+            shutil.rmtree(cache_dir)
+
 
 @dataclass
 class BatchPipelineStep(PipelineStep, ABC):
@@ -2,7 +2,9 @@ from __future__ import annotations
 
 import logging
 import multiprocessing as mp
+import shutil
 from dataclasses import InitVar, dataclass, field
+from pathlib import Path
 from typing import Any
 
 from unstructured_ingest.v2.interfaces import ProcessorConfig, Uploader
@@ -115,7 +117,9 @@ class Pipeline:
         )
 
     def cleanup(self):
-        pass
+        if self.context.delete_cache and Path(self.context.work_dir).exists():
+            logger.info(f"deleting cache directory: {self.context.work_dir}")
+            shutil.rmtree(self.context.work_dir)
 
     def log_statuses(self):
         if status := self.context.status:
@@ -183,7 +187,7 @@ class Pipeline:
         return filtered_records
 
     def _run(self):
-        logger.info(f"Running local pipline: {self} with configs: " f"{self.context.json()}")
+        logger.info(f"running local pipeline: {self} with configs: " f"{self.context.json()}")
         if self.context.mp_supported:
             manager = mp.Manager()
             self.context.status = manager.dict()
@@ -228,26 +232,33 @@
             logger.info("No files to process after filtering uncompressed content, exiting")
             return
 
-        if not downloaded_data:
+        if not downloaded_data or self.context.download_only:
             return
 
         # Partition content
         elements = self.partitioner_step(downloaded_data)
+        # Download data non longer needed, delete if possible
+        self.downloader_step.delete_cache()
         elements = self.clean_results(results=elements)
         if not elements:
             logger.info("No files to process after partitioning, exiting")
             return
 
         # Run element specific modifiers
-        for step in [self.chunker_step, self.embedder_step, self.stager_step]:
-            elements = step(elements) if step else elements
+        last_step = self.partitioner_step
+        for step in [s for s in [self.chunker_step, self.embedder_step, self.stager_step] if s]:
+            elements = step(elements)
             elements = self.clean_results(results=elements)
+            # Delete data from previous step if possible since no longer needed
+            last_step.delete_cache()
+            last_step = step
             if not elements:
-                logger.info(f"No files to process after {step.__class__.__name__}, exiting")
+                logger.info(f"no files to process after {step.__class__.__name__}, exiting")
                 return
 
         # Upload the final result
         self.uploader_step(iterable=elements)
+        last_step.delete_cache()
 
     def __str__(self):
         s = [str(self.indexer_step)]
@@ -29,7 +29,7 @@ class ChunkStep(PipelineStep):
 
     def __post_init__(self):
         config = self.process.config.json() if self.process.config else None
-        logger.info(f"Created {self.identifier} with configs: {config}")
+        logger.info(f"created {self.identifier} with configs: {config}")
 
     def should_chunk(self, filepath: Path, file_data: FileData) -> bool:
         if self.context.reprocess or file_data.reprocess:
@@ -44,7 +44,7 @@ class ChunkStep(PipelineStep):
 
     def _save_output(self, output_filepath: str, chunked_content: list[dict]):
         with open(str(output_filepath), "w") as f:
-            logger.debug(f"Writing chunker output to: {output_filepath}")
+            logger.debug(f"writing chunker output to: {output_filepath}")
             json.dump(chunked_content, f, indent=2)
 
     async def _run_async(
@@ -54,7 +54,7 @@ class ChunkStep(PipelineStep):
         file_data = FileData.from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=path)
         if not self.should_chunk(filepath=output_filepath, file_data=file_data):
-            logger.debug(f"Skipping chunking, output already exists: {output_filepath}")
+            logger.debug(f"skipping chunking, output already exists: {output_filepath}")
             return ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath))
         fn_kwargs = {"elements_filepath": path}
         if not asyncio.iscoroutinefunction(fn):
@@ -1,6 +1,7 @@
 import asyncio
 import hashlib
 import json
+import shutil
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict, TypeVar
@@ -82,7 +83,7 @@ class DownloadStep(PipelineStep):
                 f"match size of local file: {file_size_bytes}, updating"
             )
             file_data.metadata.filesize_bytes = file_size_bytes
-            logger.debug(f"Updating file data with new content: {file_data.to_dict()}")
+            logger.debug(f"updating file data with new content: {file_data.to_dict()}")
             with file_data_path.open("w") as file:
                 json.dump(file_data.to_dict(), file, indent=2)
 
@@ -90,7 +91,7 @@ class DownloadStep(PipelineStep):
         file_data = FileData.from_file(path=file_data_path)
         download_path = self.process.get_download_path(file_data=file_data)
         if not self.should_download(file_data=file_data, file_data_path=file_data_path):
-            logger.debug(f"Skipping download, file already exists locally: {download_path}")
+            logger.debug(f"skipping download, file already exists locally: {download_path}")
             self.update_file_data(
                 file_data=file_data,
                 file_data_path=Path(file_data_path),
@@ -185,3 +186,17 @@ class DownloadStep(PipelineStep):
         if extras:
             hashable_string += "".join(extras)
         return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
+
+    @property
+    def cache_dir(self) -> Path:
+        return self.process.download_config.download_dir
+
+    def delete_cache(self):
+        if (
+            self.context.iter_delete
+            and not self.context.preserve_downloads
+            and self.cache_dir.exists()
+        ):
+            cache_dir = self.cache_dir
+            logger.info(f"deleting {self.identifier} cache dir {cache_dir}")
+            shutil.rmtree(cache_dir)
@@ -29,7 +29,7 @@ class EmbedStep(PipelineStep):
 
     def __post_init__(self):
         config = self.process.config.json() if self.process.config else None
-        logger.info(f"Created {self.identifier} with configs: {config}")
+        logger.info(f"created {self.identifier} with configs: {config}")
 
     def should_embed(self, filepath: Path, file_data: FileData) -> bool:
         if self.context.reprocess or file_data.reprocess:
@@ -44,7 +44,7 @@ class EmbedStep(PipelineStep):
 
     def _save_output(self, output_filepath: str, embedded_content: list[dict]):
         with open(str(output_filepath), "w") as f:
-            logger.debug(f"Writing embedded output to: {output_filepath}")
+            logger.debug(f"writing embedded output to: {output_filepath}")
             json.dump(embedded_content, f, indent=2)
 
     async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse:
@@ -52,7 +52,7 @@ class EmbedStep(PipelineStep):
         file_data = FileData.from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=path)
         if not self.should_embed(filepath=output_filepath, file_data=file_data):
-            logger.debug(f"Skipping embedding, output already exists: {output_filepath}")
+            logger.debug(f"skipping embedding, output already exists: {output_filepath}")
             return EmbedStepResponse(file_data_path=file_data_path, path=str(output_filepath))
         fn_kwargs = {"elements_filepath": path}
         if not asyncio.iscoroutinefunction(fn):
@@ -17,7 +17,7 @@ class FilterStep(PipelineStep):
 
     def __post_init__(self):
         config = self.process.config.json() if self.process.config else None
-        logger.info(f"Created {self.identifier} with configs: {config}")
+        logger.info(f"created {self.identifier} with configs: {config}")
 
     async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
         file_data = FileData.from_file(path=file_data_path)
@@ -28,14 +28,14 @@ class IndexStep(PipelineStep):
             self.process.connection_config.json() if self.process.connection_config else None
         )
         logger.info(
-            f"Created {self.identifier} with configs: {config}, "
+            f"created {self.identifier} with configs: {config}, "
             f"connection configs: {connection_config}"
         )
 
     @instrument(span_name=STEP_ID)
     def run(self) -> Generator[str, None, None]:
         for file_data in self.process.run():
-            logger.debug(f"Generated file data: {file_data.to_dict()}")
+            logger.debug(f"generated file data: {file_data.to_dict()}")
             try:
                 record_hash = self.get_hash(extras=[file_data.identifier])
                 filename = f"{record_hash}.json"
@@ -29,7 +29,7 @@ class PartitionStep(PipelineStep):
 
     def __post_init__(self):
         config = self.process.config.json()
-        logger.info(f"Created {self.identifier} with configs: {config}")
+        logger.info(f"created {self.identifier} with configs: {config}")
 
     def should_partition(self, filepath: Path, file_data: FileData) -> bool:
         if self.context.reprocess or file_data.reprocess:
@@ -44,7 +44,7 @@ class PartitionStep(PipelineStep):
 
     def _save_output(self, output_filepath: str, partitioned_content: list[dict]):
         with open(str(output_filepath), "w") as f:
-            logger.debug(f"Writing partitioned output to: {output_filepath}")
+            logger.debug(f"writing partitioned output to: {output_filepath}")
             json.dump(partitioned_content, f, indent=2)
 
     async def _run_async(
@@ -54,7 +54,7 @@ class PartitionStep(PipelineStep):
         file_data = FileData.from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=Path(file_data_path))
         if not self.should_partition(filepath=output_filepath, file_data=file_data):
-            logger.debug(f"Skipping partitioning, output already exists: {output_filepath}")
+            logger.debug(f"skipping partitioning, output already exists: {output_filepath}")
             return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
         fn_kwargs = {"filename": path, "metadata": file_data.metadata.to_dict()}
         if not asyncio.iscoroutinefunction(fn):
@@ -31,7 +31,7 @@ class UploadStageStep(PipelineStep):
             self.process.upload_stager_config.json() if self.process.upload_stager_config else None
         )
         self.cache_dir.mkdir(parents=True, exist_ok=True)
-        logger.info(f"Created {self.identifier} with configs: {config}")
+        logger.info(f"created {self.identifier} with configs: {config}")
 
     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str
@@ -23,7 +23,7 @@ class UncompressStep(PipelineStep):
 
     def __post_init__(self):
         config = self.process.config.json() if self.process.config else None
-        logger.info(f"Created {self.identifier} with configs: {config}")
+        logger.info(f"created {self.identifier} with configs: {config}")
 
     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str
@@ -6,6 +6,8 @@ from unstructured_ingest.v2.processes.connector_registry import (
     add_source_entry,
 )
 
+from .airtable import CONNECTOR_TYPE as AIRTABLE_CONNECTOR_TYPE
+from .airtable import airtable_source_entry
 from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE
 from .astradb import astra_db_destination_entry
 from .azure_cognitive_search import CONNECTOR_TYPE as AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE
@@ -92,3 +94,4 @@ add_destination_entry(
 )
 
 add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)
+add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entry)
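
The two hunks above wire the new airtable connector (unstructured_ingest/v2/processes/connectors/airtable.py, +235 lines in the file list) into the v2 source registry. The sketch below merely restates that import-time registration with fully qualified imports; it uses only names that appear in this diff and adds no behavior beyond what the package already does on import.

# Equivalent, spelled-out form of the registration performed when
# unstructured_ingest.v2.processes.connectors is imported (last line of the hunk above).
from unstructured_ingest.v2.processes.connector_registry import add_source_entry
from unstructured_ingest.v2.processes.connectors.airtable import (
    CONNECTOR_TYPE as AIRTABLE_CONNECTOR_TYPE,
    airtable_source_entry,
)

add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entry)

Once that entry is registered, the airtable source should be discoverable through the same connector registry used by the other v2 sources imported in that module.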