unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (125)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +66 -12
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -21
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/__init__.py +2 -1
  34. unstructured_ingest/v2/interfaces/connector.py +5 -7
  35. unstructured_ingest/v2/interfaces/downloader.py +17 -8
  36. unstructured_ingest/v2/interfaces/file_data.py +13 -2
  37. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  38. unstructured_ingest/v2/interfaces/process.py +3 -4
  39. unstructured_ingest/v2/interfaces/processor.py +10 -10
  40. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  41. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  42. unstructured_ingest/v2/pipeline/interfaces.py +3 -5
  43. unstructured_ingest/v2/pipeline/pipeline.py +73 -7
  44. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  45. unstructured_ingest/v2/pipeline/steps/download.py +90 -24
  46. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  47. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  48. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  49. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  50. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  51. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  52. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  53. unstructured_ingest/v2/processes/__init__.py +18 -0
  54. unstructured_ingest/v2/processes/chunker.py +74 -28
  55. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  56. unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
  57. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +53 -35
  58. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +38 -27
  59. unstructured_ingest/v2/processes/connectors/chroma.py +38 -27
  60. unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
  61. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +95 -31
  62. unstructured_ingest/v2/processes/connectors/elasticsearch.py +92 -53
  63. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +47 -16
  64. unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +18 -11
  66. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +49 -61
  67. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +46 -13
  68. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +50 -20
  69. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +27 -28
  70. unstructured_ingest/v2/processes/connectors/google_drive.py +52 -42
  71. unstructured_ingest/v2/processes/connectors/local.py +36 -28
  72. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  73. unstructured_ingest/v2/processes/connectors/mongodb.py +32 -22
  74. unstructured_ingest/v2/processes/connectors/onedrive.py +31 -16
  75. unstructured_ingest/v2/processes/connectors/opensearch.py +81 -43
  76. unstructured_ingest/v2/processes/connectors/pinecone.py +29 -23
  77. unstructured_ingest/v2/processes/connectors/salesforce.py +36 -26
  78. unstructured_ingest/v2/processes/connectors/sharepoint.py +64 -33
  79. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  80. unstructured_ingest/v2/processes/connectors/sql.py +52 -39
  81. unstructured_ingest/v2/processes/connectors/weaviate.py +35 -18
  82. unstructured_ingest/v2/processes/embedder.py +106 -47
  83. unstructured_ingest/v2/processes/filter.py +60 -0
  84. unstructured_ingest/v2/processes/partitioner.py +79 -33
  85. unstructured_ingest/v2/processes/uncompress.py +3 -3
  86. unstructured_ingest/v2/utils.py +45 -0
  87. unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
  88. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +92 -116
  89. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
  90. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  91. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  92. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  93. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  94. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  95. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  96. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  97. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  98. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  99. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -77
  100. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  101. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  102. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  103. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  104. unstructured_ingest/v2/cli/cmds/local.py +0 -60
  105. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  106. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  107. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  108. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  109. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  110. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  111. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  112. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  113. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  114. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  115. unstructured_ingest/v2/cli/configs/__init__.py +0 -6
  116. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  117. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  118. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  119. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  120. unstructured_ingest/v2/cli/interfaces.py +0 -27
  121. unstructured_ingest/v2/pipeline/utils.py +0 -15
  122. unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
  123. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  124. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
  125. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
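Note the astra → astradb renames above (cli/cmds, connector, runner, runner/writers, and the v2 connector): any code importing those modules directly needs updated import paths. A minimal sketch; only the module paths come from this diff, the commented-out "before" line is illustrative:

# Before (0.0.2.dev0): the Astra connector module lived at connector/astra.py
# from unstructured_ingest.connector import astra

# After (0.0.4): the same module is now connector/astradb.py
from unstructured_ingest.connector import astradb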
unstructured_ingest/v2/pipeline/pipeline.py

@@ -9,12 +9,12 @@ from unstructured_ingest.v2.logger import logger, make_default_logger
 from unstructured_ingest.v2.pipeline.steps.chunk import Chunker, ChunkStep
 from unstructured_ingest.v2.pipeline.steps.download import DownloaderT, DownloadStep
 from unstructured_ingest.v2.pipeline.steps.embed import Embedder, EmbedStep
+from unstructured_ingest.v2.pipeline.steps.filter import Filterer, FilterStep
 from unstructured_ingest.v2.pipeline.steps.index import IndexerT, IndexStep
 from unstructured_ingest.v2.pipeline.steps.partition import Partitioner, PartitionStep
 from unstructured_ingest.v2.pipeline.steps.stage import UploadStager, UploadStageStep
 from unstructured_ingest.v2.pipeline.steps.uncompress import Uncompressor, UncompressStep
 from unstructured_ingest.v2.pipeline.steps.upload import Uploader, UploadStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.chunker import ChunkerConfig
 from unstructured_ingest.v2.processes.connector_registry import (
     ConnectionConfig,
@@ -27,6 +27,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 from unstructured_ingest.v2.processes.connectors.local import LocalUploader
 from unstructured_ingest.v2.processes.embedder import EmbedderConfig
+from unstructured_ingest.v2.processes.filter import FiltererConfig
 from unstructured_ingest.v2.processes.partitioner import PartitionerConfig


@@ -37,22 +38,33 @@ class PipelineError(Exception):
 @dataclass
 class Pipeline:
     context: ProcessorConfig
+
     indexer: InitVar[IndexerT]
     indexer_step: IndexStep = field(init=False)
+
     downloader: InitVar[DownloaderT]
     downloader_step: DownloadStep = field(init=False)
+
     partitioner: InitVar[Partitioner]
     partitioner_step: PartitionStep = field(init=False)
+
     chunker: InitVar[Optional[Chunker]] = None
     chunker_step: ChunkStep = field(init=False, default=None)
+
     embedder: InitVar[Optional[Embedder]] = None
     embedder_step: EmbedStep = field(init=False, default=None)
+
     stager: InitVar[Optional[UploadStager]] = None
     stager_step: UploadStageStep = field(init=False, default=None)
+
     uploader: InitVar[Uploader] = field(default=LocalUploader())
     uploader_step: UploadStep = field(init=False, default=None)
+
     uncompress_step: UncompressStep = field(init=False, default=None)

+    filterer: InitVar[Optional[Filterer]] = None
+    filter_step: FilterStep = field(init=False, default=None)
+
     def __post_init__(
         self,
         indexer: IndexerT,
@@ -62,10 +74,12 @@ class Pipeline:
         embedder: Embedder = None,
         stager: UploadStager = None,
         uploader: Uploader = None,
+        filterer: Filterer = None,
     ):
         make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
         self.indexer_step = IndexStep(process=indexer, context=self.context)
         self.downloader_step = DownloadStep(process=downloader, context=self.context)
+        self.filter_step = FilterStep(process=filterer, context=self.context) if filterer else None
         self.partitioner_step = PartitionStep(process=partitioner, context=self.context)
         self.chunker_step = ChunkStep(process=chunker, context=self.context) if chunker else None

@@ -109,6 +123,7 @@ class Pipeline:
     def run(self):
         try:
             start_time = time()
+            self._run_prechecks()
             self._run()
             logger.info(f"Finished ingest process in {time() - start_time}s")
         finally:
@@ -130,11 +145,39 @@ class Pipeline:
         final = [f for f in flat if f]
         return final or None

+    def _run_prechecks(self):
+        steps = [self.indexer_step, self.downloader_step, self.partitioner_step, self.uploader_step]
+        if self.chunker_step:
+            steps.append(self.chunker_step)
+        if self.embedder_step:
+            steps.append(self.embedder_step)
+        if self.uncompress_step:
+            steps.append(self.uncompress_step)
+        if self.stager_step:
+            steps.append(self.stager_step)
+        failures = {}
+        for step in steps:
+            try:
+                step.process.precheck()
+            except Exception as e:
+                failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
+        if failures:
+            for k, v in failures.items():
+                logger.error(f"Step precheck failure: {k}: {v}")
+            raise PipelineError("Precheck failed")
+
+    def apply_filter(self, records: list[dict]) -> list[dict]:
+        if not self.filter_step:
+            return records
+        data_to_filter = [{"file_data_path": i["file_data_path"]} for i in records]
+        filtered_data = self.filter_step(data_to_filter)
+        filtered_data = [f for f in filtered_data if f is not None]
+        filtered_file_data_paths = [r["file_data_path"] for r in filtered_data]
+        filtered_records = [r for r in records if r["file_data_path"] in filtered_file_data_paths]
+        return filtered_records
+
     def _run(self):
-        logger.info(
-            f"Running local pipline: {self} with configs: "
-            f"{sterilize_dict(self.context.to_dict(redact_sensitive=True))}"
-        )
+        logger.info(f"Running local pipline: {self} with configs: " f"{self.context.json()}")
         if self.context.mp_supported:
             manager = mp.Manager()
             self.context.status = manager.dict()
@@ -147,18 +190,33 @@ class Pipeline:
         if not indices_inputs:
             return

+        # Initial filtering on indexed content
+        indices_inputs = self.apply_filter(records=indices_inputs)
+        if not indices_inputs:
+            return
+
         # Download associated content to local file system
         downloaded_data = self.downloader_step(indices_inputs)
         downloaded_data = self.clean_results(results=downloaded_data)
         if not downloaded_data:
             return

+        # Post download filtering
+        downloaded_data = self.apply_filter(records=downloaded_data)
+        if not downloaded_data:
+            return
+
         # Run uncompress if available
         if self.uncompress_step:
             downloaded_data = self.uncompress_step(downloaded_data)
             # Flatten list of lists
             downloaded_data = self.clean_results(results=downloaded_data)

+        # Post uncompress filtering
+        downloaded_data = self.apply_filter(records=downloaded_data)
+        if not downloaded_data:
+            return
+
         if not downloaded_data:
             return

@@ -179,9 +237,14 @@ class Pipeline:
         self.uploader_step(iterable=elements)

     def __str__(self):
-        s = [str(self.indexer_step), str(self.downloader_step)]
+        s = [str(self.indexer_step)]
+        if filter_step := self.filter_step:
+            s.append(str(filter_step))
+        s.append(str(self.downloader_step))
+        if filter_step := self.filter_step:
+            s.append(str(filter_step))
         if uncompress_step := self.uncompress_step:
-            s.append(str(uncompress_step))
+            s.extend([str(uncompress_step), str(filter_step)])
         s.append(str(self.partitioner_step))
         if chunker_step := self.chunker_step:
             s.append(str(chunker_step))
@@ -200,6 +263,7 @@ class Pipeline:
         downloader_config: DownloaderConfigT,
         source_connection_config: ConnectionConfig,
         partitioner_config: PartitionerConfig,
+        filterer_config: FiltererConfig = None,
         chunker_config: Optional[ChunkerConfig] = None,
         embedder_config: Optional[EmbedderConfig] = None,
         destination_connection_config: Optional[ConnectionConfig] = None,
@@ -235,6 +299,8 @@ class Pipeline:
             ),
             "partitioner": Partitioner(config=partitioner_config),
         }
+        if filterer_config:
+            pipeline_kwargs["filterer"] = Filterer(config=filterer_config)
         if chunker_config:
             pipeline_kwargs["chunker"] = Chunker(config=chunker_config)
         if embedder_config:
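The net effect of these pipeline.py changes: prechecks run on every configured step before _run(), and the optional filter step is applied three times (after indexing, after download, and after uncompress). For reference, a standalone sketch of the apply_filter pattern above; the filter_step callable here is a stand-in for the real FilterStep, which returns {"file_data_path": ...} for kept records and None for dropped ones:

# Standalone sketch of the apply_filter pattern: the filter step returns
# {"file_data_path": ...} for records it keeps and None for records it drops,
# and the pipeline keeps only the records whose path survived.
def apply_filter(records: list[dict], filter_step) -> list[dict]:
    data_to_filter = [{"file_data_path": r["file_data_path"]} for r in records]
    filtered = [f for f in filter_step(data_to_filter) if f is not None]
    kept_paths = {f["file_data_path"] for f in filtered}
    return [r for r in records if r["file_data_path"] in kept_paths]

records = [{"file_data_path": "a.json"}, {"file_data_path": "b.json"}]
keep_a_only = lambda batch: [d if d["file_data_path"].startswith("a") else None for d in batch]
print(apply_filter(records, keep_a_only))  # -> [{'file_data_path': 'a.json'}]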
unstructured_ingest/v2/pipeline/steps/chunk.py

@@ -5,13 +5,11 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict

-from unstructured.staging.base import elements_to_dicts
-
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.chunker import Chunker
+from unstructured_ingest.v2.utils import serialize_base_model_json

 STEP_ID = "chunk"

@@ -30,11 +28,7 @@ class ChunkStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.chunking_strategy})"

     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
-            if self.process.config
-            else None
-        )
+        config = self.process.config.json() if self.process.config else None
         logger.info(f"Created {self.identifier} with configs: {config}")

     def should_chunk(self, filepath: Path, file_data: FileData) -> bool:
@@ -72,13 +66,13 @@ class ChunkStep(PipelineStep):
             chunked_content_raw = await fn(**fn_kwargs)
         self._save_output(
             output_filepath=str(output_filepath),
-            chunked_content=elements_to_dicts(chunked_content_raw),
+            chunked_content=chunked_content_raw,
         )
         return ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath))

     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(
-            self.process.config.to_dict(), sort_keys=True, ensure_ascii=True
+        hashable_string = serialize_base_model_json(
+            model=self.process.config, sort_keys=True, ensure_ascii=True
         )
         if extras:
             hashable_string += "".join(extras)
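Here and in the other step modules, config logging and hashing move from sterilize_dict/to_dict() to pydantic-style .json() plus a new serialize_base_model_json helper from unstructured_ingest/v2/utils.py (added in this release, +45 lines, body not shown in this diff). A rough sketch of what such a helper might look like, inferred only from how the call sites use it; treat it as an assumption, not the shipped implementation:

# Hedged sketch only: serialize_base_model_json is not part of this diff. The
# shape below is inferred from the call sites (model=, sort_keys=, ensure_ascii=
# keyword arguments and a JSON string return value).
import json

from pydantic import BaseModel


def serialize_base_model_json(
    model: BaseModel, sort_keys: bool = False, ensure_ascii: bool = False
) -> str:
    # Round-trip through the model's own JSON encoder so nested models and
    # custom types serialize consistently, then re-dump deterministically.
    as_dict = json.loads(model.json())
    return json.dumps(as_dict, sort_keys=sort_keys, ensure_ascii=ensure_ascii)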
unstructured_ingest/v2/pipeline/steps/download.py

@@ -2,13 +2,14 @@ import asyncio
 import hashlib
 import json
 from dataclasses import dataclass
+from pathlib import Path
 from typing import Callable, Optional, TypedDict, TypeVar

 from unstructured_ingest.v2.interfaces import FileData, download_responses
 from unstructured_ingest.v2.interfaces.downloader import Downloader
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
+from unstructured_ingest.v2.utils import serialize_base_model_json

 DownloaderT = TypeVar("DownloaderT", bound=Downloader)

@@ -29,15 +30,9 @@ class DownloadStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"

     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.download_config.to_dict(redact_sensitive=True))
-            if self.process.download_config
-            else None
-        )
+        config = self.process.download_config.json() if self.process.download_config else None
         connection_config = (
-            sterilize_dict(self.process.connection_config.to_dict(redact_sensitive=True))
-            if self.process.connection_config
-            else None
+            self.process.connection_config.json() if self.process.connection_config else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "
@@ -70,11 +65,40 @@ class DownloadStep(PipelineStep):
             return True
         return False

+    def update_file_data(
+        self, file_data: FileData, file_data_path: Path, download_path: Path
+    ) -> None:
+        file_size_bytes = download_path.stat().st_size
+        changed = False
+        if not file_data.metadata.filesize_bytes and file_size_bytes:
+            changed = True
+            file_data.metadata.filesize_bytes = file_size_bytes
+        if (
+            file_data.metadata.filesize_bytes
+            and file_data.metadata.filesize_bytes != file_size_bytes
+        ):
+            logger.warning(
+                f"file size in original file data "
+                f"({file_data.metadata.filesize_bytes}) doesn't "
+                f"match size of local file: {file_size_bytes}, updating"
+            )
+            changed = True
+            file_data.metadata.filesize_bytes = file_size_bytes
+        if changed:
+            logger.debug(f"Updating file data with new content: {file_data.to_dict()}")
+            with file_data_path.open("w") as file:
+                json.dump(file_data.to_dict(), file, indent=2)
+
     async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
         file_data = FileData.from_file(path=file_data_path)
         download_path = self.process.get_download_path(file_data=file_data)
         if not self.should_download(file_data=file_data, file_data_path=file_data_path):
             logger.debug(f"Skipping download, file already exists locally: {download_path}")
+            self.update_file_data(
+                file_data=file_data,
+                file_data_path=Path(file_data_path),
+                download_path=download_path,
+            )
             return [DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))]
         fn_kwargs = {"file_data": file_data}
         if not asyncio.iscoroutinefunction(fn):
@@ -85,26 +109,60 @@ class DownloadStep(PipelineStep):
         else:
             download_results = await fn(**fn_kwargs)
         return self.create_step_results(
-            current_file_data_path=file_data_path, download_results=download_results
+            current_file_data_path=file_data_path,
+            download_results=download_results,
+            current_file_data=file_data,
         )

     def create_step_results(
-        self, current_file_data_path: str, download_results: download_responses
+        self,
+        current_file_data_path: str,
+        current_file_data: FileData,
+        download_results: download_responses,
     ) -> list[DownloadStepResponse]:
+        responses = []
         if not isinstance(download_results, list):
-            return [
-                DownloadStepResponse(
-                    file_data_path=current_file_data_path, path=str(download_results["path"])
+            file_data = current_file_data
+            file_data_path = current_file_data_path
+            download_path = download_results["path"]
+            if download_results["file_data"].identifier == current_file_data.identifier:
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
+                )
+                responses = [
+                    DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))
+                ]
+            else:
+                file_data = download_results["file_data"]
+                file_data_path = self.persist_new_file_data(file_data=file_data)
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
                 )
-            ]
+                responses = [
+                    DownloadStepResponse(
+                        file_data_path=current_file_data_path, path=str(download_results["path"])
+                    )
+                ]
+        else:
             # Supplemental results generated as part of the download process
-        download_step_results = []
-        for res in download_results:
-            file_data_path = self.persist_new_file_data(file_data=res["file_data"])
-            download_step_results.append(
-                DownloadStepResponse(file_data_path=file_data_path, path=res["path"])
-            )
-        return download_step_results
+            for res in download_results:
+                file_data = res["file_data"]
+                file_data_path = self.persist_new_file_data(file_data=file_data)
+                download_path = res["path"]
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
+                )
+                responses.append(
+                    DownloadStepResponse(file_data_path=file_data_path, path=res["path"])
+                )
+
+        return responses

     def persist_new_file_data(self, file_data: FileData) -> str:
         record_hash = self.get_hash(extras=[file_data.identifier])
@@ -116,9 +174,17 @@ class DownloadStep(PipelineStep):
         return str(filepath)

     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(
-            sterilize_dict(self.process.download_config.to_dict()), sort_keys=True
+        download_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.download_config)
+        )
+        connection_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.connection_config)
         )
+        hashable_dict = {
+            "download_config": download_config_dict,
+            "connection_config": connection_config_dict,
+        }
+        hashable_string = json.dumps(hashable_dict, sort_keys=True)
         if extras:
             hashable_string += "".join(extras)
         return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
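The download step's cache key now covers both the download config and the connection config: each is serialized, combined under fixed keys, dumped with sort_keys=True, SHA-256 hashed, and truncated to 12 hex characters. A standalone sketch of that pattern, with plain dicts standing in for the real config models:

# Standalone sketch of the new get_hash pattern: both configs feed one
# deterministic JSON string that is SHA-256 hashed and truncated to 12 hex chars.
import hashlib
import json

def cache_key(download_config: dict, connection_config: dict, extras: list = None) -> str:
    hashable_dict = {
        "download_config": download_config,
        "connection_config": connection_config,
    }
    hashable_string = json.dumps(hashable_dict, sort_keys=True)
    if extras:
        hashable_string += "".join(extras)
    return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]

print(cache_key({"download_dir": "/tmp/ingest"}, {"anonymous": True}, extras=["file-123"]))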
unstructured_ingest/v2/pipeline/steps/embed.py

@@ -5,13 +5,11 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict

-from unstructured.staging.base import elements_to_dicts
-
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.embedder import Embedder
+from unstructured_ingest.v2.utils import serialize_base_model_json

 STEP_ID = "embed"

@@ -30,11 +28,7 @@ class EmbedStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.embedding_provider})"

     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
-            if self.process.config
-            else None
-        )
+        config = self.process.config.json() if self.process.config else None
         logger.info(f"Created {self.identifier} with configs: {config}")

     def should_embed(self, filepath: Path, file_data: FileData) -> bool:
@@ -71,13 +65,13 @@ class EmbedStep(PipelineStep):

         self._save_output(
             output_filepath=str(output_filepath),
-            embedded_content=elements_to_dicts(embed_content_raw),
+            embedded_content=embed_content_raw,
         )
         return EmbedStepResponse(file_data_path=file_data_path, path=str(output_filepath))

     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(
-            self.process.config.to_dict(), sort_keys=True, ensure_ascii=True
+        hashable_string = serialize_base_model_json(
+            model=self.process.config, sort_keys=True, ensure_ascii=True
         )
         if extras:
             hashable_string += "".join(extras)
unstructured_ingest/v2/pipeline/steps/filter.py (new file)

@@ -0,0 +1,35 @@
+import asyncio
+from dataclasses import dataclass
+from typing import Callable, Optional
+
+from unstructured_ingest.v2.interfaces.file_data import FileData
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
+from unstructured_ingest.v2.processes.filter import Filterer
+
+STEP_ID = "filter"
+
+
+@dataclass
+class FilterStep(PipelineStep):
+    process: Filterer
+    identifier: str = STEP_ID
+
+    def __post_init__(self):
+        config = self.process.config.json() if self.process.config else None
+        logger.info(f"Created {self.identifier} with configs: {config}")
+
+    async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
+        file_data = FileData.from_file(path=file_data_path)
+        fn_kwargs = {"file_data": file_data}
+        if not asyncio.iscoroutinefunction(fn):
+            resp = fn(**fn_kwargs)
+        elif semaphore := self.context.semaphore:
+            async with semaphore:
+                resp = await fn(**fn_kwargs)
+        else:
+            resp = await fn(**fn_kwargs)
+
+        if resp:
+            return {"file_data_path": file_data_path}
+        return None
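FilterStep only records which file_data_paths survive; the keep/drop decision comes from the wrapped Filterer (unstructured_ingest/v2/processes/filter.py, new in this release and not shown in this diff), whose callable receives a FileData and returns truthy to keep the record. A hedged sketch of a predicate honoring that contract, using only the metadata.filesize_bytes field this diff shows the download step maintaining:

# Hedged sketch of the contract FilterStep relies on; this predicate is a
# stand-in, not the real Filterer implementation.
def keep_small_files(file_data) -> bool:
    # filesize_bytes is the metadata field maintained by the download step (see
    # the download.py hunks above); None means the size is unknown, so keep it.
    size = getattr(file_data.metadata, "filesize_bytes", None)
    return size is None or size < 1_000_000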
unstructured_ingest/v2/pipeline/steps/index.py

@@ -6,7 +6,7 @@ from typing import Generator, Optional, TypeVar
 from unstructured_ingest.v2.interfaces.indexer import Indexer
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
+from unstructured_ingest.v2.utils import serialize_base_model_json

 IndexerT = TypeVar("IndexerT", bound=Indexer)

@@ -22,15 +22,9 @@ class IndexStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"

     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.index_config.to_dict(redact_sensitive=True))
-            if self.process.index_config
-            else None
-        )
+        config = self.process.index_config.json() if self.process.index_config else None
         connection_config = (
-            sterilize_dict(self.process.connection_config.to_dict(redact_sensitive=True))
-            if self.process.connection_config
-            else None
+            self.process.connection_config.json() if self.process.connection_config else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "
@@ -55,7 +49,17 @@ class IndexStep(PipelineStep):
                 continue

     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(self.process.index_config.to_dict())
+        index_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.index_config, sort_keys=True)
+        )
+        connection_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.connection_config, sort_keys=True)
+        )
+        hashable_dict = {
+            "index_config": index_config_dict,
+            "connection_config": connection_config_dict,
+        }
+        hashable_string = json.dumps(hashable_dict, sort_keys=True)
         if extras:
             hashable_string += "".join(extras)
         return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
unstructured_ingest/v2/pipeline/steps/partition.py

@@ -8,8 +8,8 @@ from typing import Callable, Optional, TypedDict
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.partitioner import Partitioner
+from unstructured_ingest.v2.utils import serialize_base_model_json

 STEP_ID = "partition"

@@ -28,7 +28,7 @@ class PartitionStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.strategy})"

     def __post_init__(self):
-        config = sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
+        config = self.process.config.json()
         logger.info(f"Created {self.identifier} with configs: {config}")

     def should_partition(self, filepath: Path, file_data: FileData) -> bool:
@@ -56,7 +56,7 @@ class PartitionStep(PipelineStep):
         if not self.should_partition(filepath=output_filepath, file_data=file_data):
             logger.debug(f"Skipping partitioning, output already exists: {output_filepath}")
             return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
-        fn_kwargs = {"filename": path, "metadata": file_data.metadata}
+        fn_kwargs = {"filename": path, "metadata": file_data.metadata.to_dict()}
         if not asyncio.iscoroutinefunction(fn):
             partitioned_content = fn(**fn_kwargs)
         elif semaphore := self.context.semaphore:
@@ -70,8 +70,8 @@ class PartitionStep(PipelineStep):
         return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))

     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(
-            self.process.config.to_dict(), sort_keys=True, ensure_ascii=True
+        hashable_string = serialize_base_model_json(
+            model=self.process.config, sort_keys=True, ensure_ascii=True
         )
         if extras:
             hashable_string += "".join(extras)
unstructured_ingest/v2/pipeline/steps/stage.py

@@ -1,6 +1,5 @@
 import asyncio
 import hashlib
-import json
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
@@ -9,7 +8,7 @@ from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.upload_stager import UploadStager
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
+from unstructured_ingest.v2.utils import serialize_base_model_json

 STEP_ID = "upload_stage"

@@ -29,9 +28,7 @@ class UploadStageStep(PipelineStep):

     def __post_init__(self):
         config = (
-            sterilize_dict(self.process.upload_stager_config.to_dict(redact_sensitive=True))
-            if self.process.upload_stager_config
-            else None
+            self.process.upload_stager_config.json() if self.process.upload_stager_config else None
         )
         self.cache_dir.mkdir(parents=True, exist_ok=True)
         logger.info(f"Created {self.identifier} with configs: {config}")
@@ -56,8 +53,8 @@ class UploadStageStep(PipelineStep):
         return UploadStageStepResponse(file_data_path=file_data_path, path=str(staged_output_path))

     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(
-            self.process.upload_stager_config.to_dict(), sort_keys=True, ensure_ascii=True
+        hashable_string = serialize_base_model_json(
+            model=self.process.upload_stager_config, sort_keys=True, ensure_ascii=True
         )
         if extras:
             hashable_string += "".join(extras)
unstructured_ingest/v2/pipeline/steps/uncompress.py

@@ -5,7 +5,6 @@ from typing import Callable, TypedDict
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.uncompress import Uncompressor

 STEP_ID = "uncompress"
@@ -21,11 +20,7 @@ class UncompressStep(PipelineStep):
     identifier: str = STEP_ID

     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
-            if self.process.config
-            else None
-        )
+        config = self.process.config.json() if self.process.config else None
         logger.info(f"Created {self.identifier} with configs: {config}")

     def _run(self, path: str, file_data_path: str) -> list[UncompressStepResponse]:
unstructured_ingest/v2/pipeline/steps/upload.py

@@ -7,7 +7,6 @@ from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.interfaces.uploader import UploadContent, Uploader
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep, iterable_input, timed
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict

 STEP_ID = "upload"

@@ -26,15 +25,9 @@ class UploadStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"

     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.upload_config.to_dict(redact_sensitive=True))
-            if self.process.upload_config
-            else None
-        )
+        config = self.process.upload_config.json() if self.process.upload_config else None
         connection_config = (
-            sterilize_dict(self.process.connection_config.to_dict(redact_sensitive=True))
-            if self.process.connection_config
-            else None
+            self.process.connection_config.json() if self.process.connection_config else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "
unstructured_ingest/v2/processes/__init__.py (new file)

@@ -0,0 +1,18 @@
+from .chunker import Chunker, ChunkerConfig
+from .embedder import Embedder, EmbedderConfig
+from .filter import Filterer, FiltererConfig
+from .partitioner import Partitioner, PartitionerConfig
+from .uncompress import UncompressConfig, Uncompressor
+
+__all__ = [
+    "Chunker",
+    "ChunkerConfig",
+    "Embedder",
+    "EmbedderConfig",
+    "Filterer",
+    "FiltererConfig",
+    "Partitioner",
+    "PartitionerConfig",
+    "Uncompressor",
+    "UncompressConfig",
+]
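The new unstructured_ingest.v2.processes package re-exports each process class alongside its config, so downstream code can import them from a single namespace, for example:

# The new package __init__ shown above makes single-namespace imports possible.
from unstructured_ingest.v2.processes import (
    Chunker,
    ChunkerConfig,
    Filterer,
    FiltererConfig,
)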