unstructured_ingest-1.2.32-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (243)
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
--- /dev/null
+++ b/unstructured_ingest/pipeline/pipeline.py
@@ -0,0 +1,408 @@
+from __future__ import annotations
+
+import asyncio
+import logging
+import multiprocessing as mp
+import shutil
+from dataclasses import InitVar, dataclass, field
+from pathlib import Path
+from typing import Any
+
+from unstructured_ingest.interfaces import ProcessorConfig, Uploader
+from unstructured_ingest.logger import logger, make_default_logger
+from unstructured_ingest.otel import OtelHandler
+from unstructured_ingest.pipeline.interfaces import PipelineStep
+from unstructured_ingest.pipeline.steps.chunk import Chunker, ChunkStep
+from unstructured_ingest.pipeline.steps.download import DownloaderT, DownloadStep
+from unstructured_ingest.pipeline.steps.embed import Embedder, EmbedStep
+from unstructured_ingest.pipeline.steps.filter import Filterer, FilterStep
+from unstructured_ingest.pipeline.steps.index import IndexerT, IndexStep
+from unstructured_ingest.pipeline.steps.partition import Partitioner, PartitionStep
+from unstructured_ingest.pipeline.steps.stage import UploadStager, UploadStageStep
+from unstructured_ingest.pipeline.steps.uncompress import Uncompressor, UncompressStep
+from unstructured_ingest.pipeline.steps.upload import UploadStep
+from unstructured_ingest.processes.chunker import ChunkerConfig
+from unstructured_ingest.processes.connector_registry import (
+    ConnectionConfig,
+    DownloaderConfigT,
+    IndexerConfigT,
+    UploaderConfigT,
+    UploadStagerConfigT,
+    destination_registry,
+    source_registry,
+)
+from unstructured_ingest.processes.connectors.local import LocalUploader
+from unstructured_ingest.processes.embedder import EmbedderConfig
+from unstructured_ingest.processes.filter import FiltererConfig
+from unstructured_ingest.processes.partitioner import PartitionerConfig
+
+
+class PipelineError(Exception):
+    pass
+
+
+@dataclass
+class Pipeline:
+    context: ProcessorConfig
+
+    indexer: InitVar[IndexerT]
+    indexer_step: IndexStep = field(init=False)
+
+    downloader: InitVar[DownloaderT]
+    downloader_step: DownloadStep = field(init=False)
+
+    partitioner: InitVar[Partitioner]
+    partitioner_step: PartitionStep = field(init=False)
+
+    chunker: InitVar[Chunker | None] = None
+    chunker_step: ChunkStep | None = field(init=False, default=None)
+
+    embedder: InitVar[Embedder | None] = None
+    embedder_step: EmbedStep | None = field(init=False, default=None)
+
+    stager: InitVar[UploadStager | None] = None
+    stager_step: UploadStageStep | None = field(init=False, default=None)
+
+    uploader: InitVar[Uploader] = field(default=LocalUploader())
+    uploader_step: UploadStep | None = field(init=False, default=None)
+
+    uncompress_step: UncompressStep | None = field(init=False, default=None)
+
+    filterer: InitVar[Filterer | None] = None
+    filter_step: FilterStep | None = field(init=False, default=None)
+
+    def __post_init__(
+        self,
+        indexer: IndexerT,
+        downloader: DownloaderT,
+        partitioner: Partitioner,
+        chunker: Chunker | None = None,
+        embedder: Embedder | None = None,
+        stager: UploadStager | None = None,
+        uploader: Uploader | None = None,
+        filterer: Filterer | None = None,
+    ):
+        make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
+        otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint)
+        otel_handler.init_trace()
+        self.indexer_step = IndexStep(process=indexer, context=self.context)
+        self.downloader_step = DownloadStep(process=downloader, context=self.context)
+        self.filter_step = FilterStep(process=filterer, context=self.context) if filterer else None
+        self.partitioner_step = PartitionStep(process=partitioner, context=self.context)
+        self.chunker_step = ChunkStep(process=chunker, context=self.context) if chunker else None
+
+        self.embedder_step = EmbedStep(process=embedder, context=self.context) if embedder else None
+
+        self.stager_step = UploadStageStep(process=stager, context=self.context) if stager else None
+        self.uploader_step = UploadStep(process=uploader, context=self.context)
+        if self.context.uncompress:
+            process = Uncompressor()
+            self.uncompress_step = UncompressStep(process=process, context=self.context)
+
+        self.check_destination_connector()
+
+    def check_destination_connector(self):
+        # Make sure that if the set destination connector expects a stager, one is also set
+        if not self.uploader_step:
+            return
+        uploader_connector_type = self.uploader_step.process.connector_type
+        registry_entry = destination_registry[uploader_connector_type]
+        if registry_entry.upload_stager and self.stager_step is None:
+            try:
+                self.stager_step = UploadStageStep(
+                    process=registry_entry.upload_stager(), context=self.context
+                )
+                return
+            except Exception as e:
+                logger.debug(f"failed to instantiate required stager on user's behalf: {e}")
+            raise ValueError(
+                f"pipeline with uploader type {self.uploader_step.process.__class__.__name__} "
+                f"expects a stager of type {registry_entry.upload_stager.__name__} "
+                f"but one was not set"
+            )
+
+    def cleanup(self):
+        if self.context.delete_cache and Path(self.context.work_dir).exists():
+            logger.info(f"deleting cache directory: {self.context.work_dir}")
+            shutil.rmtree(self.context.work_dir)
+
+    def log_statuses(self):
+        if status := self.context.status:
+            logger.error(f"{len(status)} failed documents:")
+            for k, v in status.items():
+                for kk, vv in v.items():
+                    logger.error(f"{k}: [{kk}] {vv}")
+
+    def _run_initialization(self):
+        failures = {}
+        init_kwargs = {}
+        for step in self._get_ordered_steps():
+            try:
+                step.process.init(**init_kwargs)
+                step.process.precheck()
+                # Make sure embedder dimensions available for downstream steps
+                if isinstance(step.process, Embedder):
+                    embed_dimensions = step.process.config.get_embedder().dimension
+                    init_kwargs["vector_length"] = embed_dimensions
+
+            except Exception as e:
+                failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
+        if failures:
+            for k, v in failures.items():
+                logger.error(f"Step initialization failure: {k}: {v}")
+            raise PipelineError("Initialization failed")
+
+    def run(self):
+        otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.info)
+        try:
+            with otel_handler.get_tracer().start_as_current_span(
+                "ingest process", record_exception=True
+            ):
+                self._run_initialization()
+                self._run()
+        finally:
+            self.log_statuses()
+            self.cleanup()
+            if self.context.status:
+                raise PipelineError("Pipeline did not run successfully")
+
+    def clean_results(self, results: list[Any | list[Any]] | None) -> list[Any] | None:
+        if not results:
+            return None
+        results = [r for r in results if r]
+        flat = []
+        for r in results:
+            if isinstance(r, list):
+                flat.extend(r)
+            else:
+                flat.append(r)
+        final = [f for f in flat if f]
+        return final or None
+
+    def _get_ordered_steps(self) -> list[PipelineStep]:
+        steps = [self.indexer_step, self.downloader_step]
+        if self.uncompress_step:
+            steps.append(self.uncompress_step)
+        steps.append(self.partitioner_step)
+        if self.chunker_step:
+            steps.append(self.chunker_step)
+        if self.embedder_step:
+            steps.append(self.embedder_step)
+        if self.stager_step:
+            steps.append(self.stager_step)
+        steps.append(self.uploader_step)
+        return steps
+
+    def apply_filter(self, records: list[dict]) -> list[dict]:
+        if not self.filter_step:
+            return records
+        data_to_filter = [{"file_data_path": i["file_data_path"]} for i in records]
+        filtered_data = self.filter_step(data_to_filter)
+        filtered_data = [f for f in filtered_data if f is not None]
+        filtered_file_data_paths = [r["file_data_path"] for r in filtered_data]
+        filtered_records = [r for r in records if r["file_data_path"] in filtered_file_data_paths]
+        return filtered_records
+
+    def get_indices(self) -> list[dict]:
+        if self.indexer_step.process.is_async():
+
+            async def run_async():
+                output = []
+                async for i in self.indexer_step.run_async():
+                    output.append(i)
+                return output
+
+            indices = asyncio.run(run_async())
+        else:
+            indices = self.indexer_step.run()
+        indices_inputs = [{"file_data_path": i} for i in indices]
+        return indices_inputs
+
+    def _run(self):
+        logger.info(
+            f"running local pipeline: {self} with configs: {self.context.model_dump_json()}"
+        )
+        if self.context.mp_supported:
+            manager = mp.Manager()
+            self.context.status = manager.dict()
+        else:
+            self.context.status = {}
+
+        # Index into data source
+        indices_inputs = self.get_indices()
+        if not indices_inputs:
+            logger.info("No files to process after indexer, exiting")
+            return
+
+        # Initial filtering on indexed content
+        indices_inputs = self.apply_filter(records=indices_inputs)
+        if not indices_inputs:
+            logger.info("No files to process after filtering indexed content, exiting")
+            return
+
+        # Download associated content to local file system
+        downloaded_data = self.downloader_step(indices_inputs)
+        downloaded_data = self.clean_results(results=downloaded_data)
+        if not downloaded_data:
+            logger.info("No files to process after downloader, exiting")
+            return
+
+        # Post download filtering
+        downloaded_data = self.apply_filter(records=downloaded_data)
+        if not downloaded_data:
+            logger.info("No files to process after filtering downloaded content, exiting")
+            return
+
+        # Run uncompress if available
+        if self.uncompress_step:
+            downloaded_data = self.uncompress_step(downloaded_data)
+            # Flatten list of lists
+            downloaded_data = self.clean_results(results=downloaded_data)
+
+        # Post uncompress filtering
+        downloaded_data = self.apply_filter(records=downloaded_data)
+        if not downloaded_data:
+            logger.info("No files to process after filtering uncompressed content, exiting")
+            return
+
+        if not downloaded_data or self.context.download_only:
+            return
+
+        # Partition content
+        elements = self.partitioner_step(downloaded_data)
+        elements = self.clean_results(results=elements)
+        # Downloaded data no longer needed, delete if possible
+        self.downloader_step.delete_cache()
+        elements = self.clean_results(results=elements)
+        if not elements:
+            logger.info("No files to process after partitioning, exiting")
+            return
+
+        # Run element specific modifiers
+        last_step = self.partitioner_step
+        for step in [s for s in [self.chunker_step, self.embedder_step, self.stager_step] if s]:
+            elements = step(elements)
+            elements = self.clean_results(results=elements)
+            # Delete data from previous step if possible since no longer needed
+            last_step.delete_cache()
+            last_step = step
+            if not elements:
+                logger.info(f"no files to process after {step.__class__.__name__}, exiting")
+                return
+
+        # Upload the final result
+        self.uploader_step(iterable=elements)
+        last_step.delete_cache()
+
+    def __str__(self):
+        s = [str(self.indexer_step)]
+        if filter_step := self.filter_step:
+            s.append(str(filter_step))
+        s.append(str(self.downloader_step))
+        if filter_step := self.filter_step:
+            s.append(str(filter_step))
+        if uncompress_step := self.uncompress_step:
+            s.extend([str(uncompress_step), str(filter_step)])
+        s.append(str(self.partitioner_step))
+        if chunker_step := self.chunker_step:
+            s.append(str(chunker_step))
+        if embedder_step := self.embedder_step:
+            s.append(str(embedder_step))
+        if stager_step := self.stager_step:
+            s.append(str(stager_step))
+        s.append(str(self.uploader_step))
+        return " -> ".join(s)
+
+    @classmethod
+    def from_configs(
+        cls,
+        context: ProcessorConfig,
+        indexer_config: IndexerConfigT,
+        downloader_config: DownloaderConfigT,
+        source_connection_config: ConnectionConfig,
+        partitioner_config: PartitionerConfig,
+        filterer_config: FiltererConfig | None = None,
+        chunker_config: ChunkerConfig | None = None,
+        embedder_config: EmbedderConfig | None = None,
+        destination_connection_config: ConnectionConfig | None = None,
+        stager_config: UploadStagerConfigT | None = None,
+        uploader_config: UploaderConfigT | None = None,
+    ) -> "Pipeline":
+        # Get registry key based on indexer config
+        source_entry = {
+            k: v
+            for k, v in source_registry.items()
+            if type(indexer_config) is v.indexer_config
+            and type(downloader_config) is v.downloader_config
+            and type(source_connection_config) is v.connection_config
+        }
+        if len(source_entry) > 1:
+            raise ValueError(
+                f"multiple entries found matching provided indexer, "
+                f"downloader and connection configs: {source_entry}"
+            )
+        if len(source_entry) != 1:
+            raise ValueError(
+                "no entry found in source registry with matching indexer, "
+                "downloader and connection configs"
+            )
+        source = list(source_entry.values())[0]
+        pipeline_kwargs = {
+            "context": context,
+            "indexer": source.indexer(
+                index_config=indexer_config, connection_config=source_connection_config
+            ),
+            "downloader": source.downloader(
+                download_config=downloader_config, connection_config=source_connection_config
+            ),
+            "partitioner": Partitioner(config=partitioner_config),
+        }
+        if filterer_config:
+            pipeline_kwargs["filterer"] = Filterer(config=filterer_config)
+        if chunker_config:
+            pipeline_kwargs["chunker"] = Chunker(config=chunker_config)
+        if embedder_config:
+            pipeline_kwargs["embedder"] = Embedder(config=embedder_config)
+        if not uploader_config:
+            return Pipeline(**pipeline_kwargs)
+
+        destination_entry = {
+            k: v
+            for k, v in destination_registry.items()
+            if isinstance(uploader_config, v.uploader_config)
+        }
+        if destination_connection_config:
+            destination_entry = {
+                k: v
+                for k, v in destination_entry.items()
+                if isinstance(destination_connection_config, v.connection_config)
+            }
+        if stager_config:
+            destination_entry = {
+                k: v
+                for k, v in destination_entry.items()
+                if isinstance(stager_config, v.upload_stager_config)
+            }
+
+        if len(destination_entry) > 1:
+            raise ValueError(
+                f"multiple entries found matching provided uploader, "
+                f"stager and connection configs: {destination_entry}"
+            )
+        if len(destination_entry) != 1:
+            raise ValueError(
+                "no entry found in destination registry with matching uploader, "
+                "stager and connection configs"
+            )
+
+        destination = list(destination_entry.values())[0]
+        if stager_config:
+            pipeline_kwargs["stager"] = destination.upload_stager(
+                upload_stager_config=stager_config
+            )
+        if uploader_config:
+            uploader_kwargs = {"upload_config": uploader_config}
+            if destination_connection_config:
+                uploader_kwargs["connection_config"] = destination_connection_config
+            pipeline_kwargs["uploader"] = destination.uploader(**uploader_kwargs)
+        return cls(**pipeline_kwargs)
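
As a usage illustration (not part of the packaged diff): a minimal sketch of assembling and running a pipeline through Pipeline.from_configs. The local connector config classes below (LocalIndexerConfig, LocalDownloaderConfig, LocalConnectionConfig) are assumed to ship alongside the LocalUploader imported above, and the field names passed to them are hypothetical. Note that from_configs matches registry entries on the exact types of these configs, so configs from different connectors cannot be mixed.

    from unstructured_ingest.interfaces import ProcessorConfig
    from unstructured_ingest.pipeline.pipeline import Pipeline
    from unstructured_ingest.processes.connectors.local import (  # assumed config classes
        LocalConnectionConfig,
        LocalDownloaderConfig,
        LocalIndexerConfig,
    )
    from unstructured_ingest.processes.partitioner import PartitionerConfig

    pipeline = Pipeline.from_configs(
        context=ProcessorConfig(),  # uploader defaults to LocalUploader when none is given
        indexer_config=LocalIndexerConfig(input_path="./docs"),  # hypothetical field name
        downloader_config=LocalDownloaderConfig(),
        source_connection_config=LocalConnectionConfig(),
        partitioner_config=PartitionerConfig(),
    )
    print(pipeline)  # step chain joined with " -> ", per __str__ above
    pipeline.run()   # raises PipelineError on init failure or any failed documents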
--- /dev/null
+++ b/unstructured_ingest/pipeline/steps/chunk.py
@@ -0,0 +1,78 @@
+import asyncio
+import hashlib
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Callable, Optional, TypedDict
+
+from unstructured_ingest.data_types.file_data import FileData, file_data_from_file
+from unstructured_ingest.logger import logger
+from unstructured_ingest.pipeline.interfaces import PipelineStep
+from unstructured_ingest.processes.chunker import Chunker
+from unstructured_ingest.utils.data_prep import write_data
+from unstructured_ingest.utils.pydantic_models import serialize_base_model_json
+
+STEP_ID = "chunk"
+
+
+class ChunkStepResponse(TypedDict):
+    file_data_path: str
+    path: str
+
+
+@dataclass
+class ChunkStep(PipelineStep):
+    process: Chunker
+    identifier: str = STEP_ID
+
+    def __str__(self):
+        return f"{self.identifier} ({self.process.config.chunking_strategy})"
+
+    def __post_init__(self):
+        config = self.process.config.model_dump_json() if self.process.config else None
+        logger.info(f"created {self.identifier} with configs: {config}")
+
+    def should_chunk(self, filepath: Path, file_data: FileData) -> bool:
+        if self.context.reprocess or file_data.reprocess:
+            return True
+        return not filepath.exists()
+
+    def get_output_filepath(self, filename: Path) -> Path:
+        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.ndjson"
+        filepath = (self.cache_dir / hashed_output_file).resolve()
+        filepath.parent.mkdir(parents=True, exist_ok=True)
+        return filepath
+
+    def _save_output(self, output_filepath: str, chunked_content: list[dict]):
+        logger.debug(f"writing chunker output to: {output_filepath}")
+        write_data(path=Path(output_filepath), data=chunked_content)
+
+    async def _run_async(
+        self, fn: Callable, path: str, file_data_path: str, **kwargs
+    ) -> ChunkStepResponse:
+        path = Path(path)
+        file_data = file_data_from_file(path=file_data_path)
+        output_filepath = self.get_output_filepath(filename=path)
+        if not self.should_chunk(filepath=output_filepath, file_data=file_data):
+            logger.debug(f"skipping chunking, output already exists: {output_filepath}")
+            return ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath))
+        fn_kwargs = {"elements_filepath": path}
+        if not asyncio.iscoroutinefunction(fn):
+            chunked_content_raw = fn(**fn_kwargs)
+        elif semaphore := self.context.semaphore:
+            async with semaphore:
+                chunked_content_raw = await fn(**fn_kwargs)
+        else:
+            chunked_content_raw = await fn(**fn_kwargs)
+        self._save_output(
+            output_filepath=str(output_filepath),
+            chunked_content=chunked_content_raw,
+        )
+        return ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath))
+
+    def get_hash(self, extras: Optional[list[str]]) -> str:
+        hashable_string = serialize_base_model_json(
+            model=self.process.config, sort_keys=True, ensure_ascii=True
+        )
+        if extras:
+            hashable_string += "".join(extras)
+        return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
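
The chunker's cache filename above is fully deterministic: the chunker config is serialized with sorted keys, the input filename is appended, and the first 12 hex characters of a SHA-256 digest become the .ndjson stem, so an unchanged config over an unchanged input re-hits the same cached output. A standalone sketch of the same derivation (illustrative only; get_hash serializes a pydantic model rather than a plain dict):

    import hashlib
    import json

    def cache_key(config: dict, extras: list[str] | None = None) -> str:
        # Mirrors ChunkStep.get_hash: stable serialization plus extras,
        # truncated SHA-256 digest used as the cache filename stem.
        hashable = json.dumps(config, sort_keys=True, ensure_ascii=True)
        if extras:
            hashable += "".join(extras)
        return hashlib.sha256(hashable.encode()).hexdigest()[:12]

    key = cache_key({"chunking_strategy": "by_title"}, extras=["report.pdf.json"])
    print(f"{key}.ndjson")  # identical inputs always map to the same output file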
--- /dev/null
+++ b/unstructured_ingest/pipeline/steps/download.py
@@ -0,0 +1,206 @@
+import asyncio
+import hashlib
+import json
+import shutil
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Callable, Optional, TypedDict, TypeVar
+
+from unstructured_ingest.data_types.file_data import FileData, file_data_from_file
+from unstructured_ingest.interfaces import Downloader, download_responses
+from unstructured_ingest.logger import logger
+from unstructured_ingest.pipeline.interfaces import PipelineStep
+from unstructured_ingest.utils.pydantic_models import serialize_base_model_json
+
+DownloaderT = TypeVar("DownloaderT", bound=Downloader)
+
+STEP_ID = "download"
+
+
+class DownloadStepResponse(TypedDict):
+    file_data_path: str
+    path: str
+
+
+@dataclass
+class DownloadStep(PipelineStep):
+    process: DownloaderT
+    identifier: str = STEP_ID
+
+    def __str__(self):
+        return f"{self.identifier} ({self.process.__class__.__name__})"
+
+    def __post_init__(self):
+        config = (
+            self.process.download_config.model_dump_json() if self.process.download_config else None
+        )
+        connection_config = (
+            self.process.connection_config.model_dump_json()
+            if self.process.connection_config
+            else None
+        )
+        logger.info(
+            f"Created {self.identifier} with configs: {config}, "
+            f"connection configs: {connection_config}"
+        )
+
+    @staticmethod
+    def is_float(value: str):
+        try:
+            float(value)
+            return True
+        except ValueError:
+            return False
+
+    def should_download(self, file_data: FileData, file_data_path: str) -> bool:
+        if self.context.re_download:
+            return True
+        download_path = self.process.get_download_path(file_data=file_data)
+        if not download_path or not download_path.exists():
+            return True
+        if (
+            download_path.is_file()
+            and file_data.metadata.date_modified
+            and self.is_float(file_data.metadata.date_modified)
+            and download_path.stat().st_mtime > float(file_data.metadata.date_modified)
+        ):
+            # Also update file data to mark this to reprocess since this won't change the filename
+            file_data.reprocess = True
+            file_data.to_file(path=file_data_path)
+            return True
+        return False
+
+    def update_file_data(
+        self, file_data: FileData, file_data_path: Path, download_path: Path
+    ) -> None:
+        file_data.local_download_path = str(download_path.resolve())
+        file_size_bytes = download_path.stat().st_size
+        if not file_data.metadata.filesize_bytes and file_size_bytes:
+            file_data.metadata.filesize_bytes = file_size_bytes
+        if (
+            file_data.metadata.filesize_bytes
+            and file_data.metadata.filesize_bytes != file_size_bytes
+        ):
+            logger.warning(
+                f"file size in original file data "
+                f"({file_data.metadata.filesize_bytes}) doesn't "
+                f"match size of local file: {file_size_bytes}, updating"
+            )
+            file_data.metadata.filesize_bytes = file_size_bytes
+        logger.debug(f"updating file data with new content: {file_data.model_dump_json()}")
+        with file_data_path.open("w") as file:
+            file.write(file_data.model_dump_json(indent=2))
+
+    async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
+        file_data = file_data_from_file(path=file_data_path)
+        download_path = self.process.get_download_path(file_data=file_data)
+        if not self.should_download(file_data=file_data, file_data_path=file_data_path):
+            logger.debug(f"skipping download, file already exists locally: {download_path}")
+            self.update_file_data(
+                file_data=file_data,
+                file_data_path=Path(file_data_path),
+                download_path=download_path,
+            )
+            return [DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))]
+        fn_kwargs = {"file_data": file_data}
+        if not asyncio.iscoroutinefunction(fn):
+            download_results = fn(**fn_kwargs)
+        elif semaphore := self.context.semaphore:
+            async with semaphore:
+                download_results = await fn(**fn_kwargs)
+        else:
+            download_results = await fn(**fn_kwargs)
+        return self.create_step_results(
+            current_file_data_path=file_data_path,
+            download_results=download_results,
+            current_file_data=file_data,
+        )
+
+    def create_step_results(
+        self,
+        current_file_data_path: str,
+        current_file_data: FileData,
+        download_results: download_responses,
+    ) -> list[DownloadStepResponse]:
+        responses = []
+        if not isinstance(download_results, list):
+            file_data = current_file_data
+            file_data_path = current_file_data_path
+            download_path = download_results["path"]
+            if download_results["file_data"].identifier == current_file_data.identifier:
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
+                )
+                responses = [
+                    DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))
+                ]
+            else:
+                file_data = download_results["file_data"]
+                file_data_path = self.persist_new_file_data(file_data=file_data)
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
+                )
+                responses = [
+                    DownloadStepResponse(
+                        file_data_path=current_file_data_path, path=str(download_results["path"])
+                    )
+                ]
+        else:
+            # Supplemental results generated as part of the download process
+            for res in download_results:
+                file_data = res["file_data"]
+                file_data_path = self.persist_new_file_data(file_data=file_data)
+                download_path = res["path"]
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
+                )
+                responses.append(
+                    DownloadStepResponse(file_data_path=file_data_path, path=res["path"])
+                )
+
+        return responses
+
+    def persist_new_file_data(self, file_data: FileData) -> str:
+        record_hash = self.get_hash(extras=[file_data.identifier])
+        filename = f"{record_hash}.json"
+        filepath = (self.cache_dir / filename).resolve()
+        filepath.parent.mkdir(parents=True, exist_ok=True)
+        with open(str(filepath), "w") as f:
+            f.write(file_data.model_dump_json(indent=2))
+        return str(filepath)
+
+    def get_hash(self, extras: Optional[list[str]]) -> str:
+        download_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.download_config)
+        )
+        connection_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.connection_config)
+        )
+        hashable_dict = {
+            "download_config": download_config_dict,
+            "connection_config": connection_config_dict,
+        }
+        hashable_string = json.dumps(hashable_dict, sort_keys=True)
+        if extras:
+            hashable_string += "".join(extras)
+        return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
+
+    @property
+    def cache_dir(self) -> Path:
+        return self.process.download_config.download_dir
+
+    def delete_cache(self):
+        if (
+            self.context.iter_delete
+            and not self.context.preserve_downloads
+            and self.cache_dir.exists()
+        ):
+            cache_dir = self.cache_dir
+            logger.info(f"deleting {self.identifier} cache dir {cache_dir}")
+            shutil.rmtree(cache_dir)
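
The cache-skip decision in should_download above re-downloads when a re-download is forced, the local copy is missing, or the local file's mtime is newer than a numeric date_modified reported by the connector (in which case the record is also flagged for reprocessing, since the filename will not change). A standalone sketch of that decision with hypothetical values, minus the FileData bookkeeping:

    import time
    from pathlib import Path

    def needs_download(local: Path, date_modified: str | None, re_download: bool = False) -> bool:
        # Mirrors DownloadStep.should_download: force, missing file, or stale cache.
        if re_download or not local.is_file():
            return True
        try:
            remote_ts = float(date_modified) if date_modified is not None else None
        except ValueError:
            return False  # non-numeric timestamps (e.g. ISO strings) skip the mtime check
        return remote_ts is not None and local.stat().st_mtime > remote_ts

    print(needs_download(Path("/tmp/ingest/report.pdf"), date_modified=str(time.time())))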