unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1 @@
1
+ from __future__ import annotations
@@ -0,0 +1 @@
1
+ __version__ = "1.2.32" # pragma: no cover
@@ -0,0 +1,28 @@
1
+ # Ingest CLI
2
+ This package helps map user input via a cli to the underlying ingest code to run a small ETL pipeline.
3
+
4
+ ## Design Reference
5
+ [cli.py](cli.py) is the main entrypoint to run the cli itself. The key point here is the interaction between all
6
+ source and destination connectors.
7
+
8
+ To manually run the cli:
9
+ ```shell
10
+ PYTHONPATH=. python unstructured_ingest/main.py --help
11
+ ```
12
+
13
+ The `main.py` file simply wraps the generated Click command created in `cli.py`.
14
+
15
+ ### Source Commands
16
+ All source commands are added as sub commands to the parent ingest Click group. This allows each command to map to
17
+ different connectors with shared and unique parameters.
18
+
19
+ ### Destination Commands
20
+ All destination commands are added as sub commands to each parent source command. This allows each invocation of the source
21
+ sub command to display all possible destination subcommands. The code in [utils.py](./utils.py) helps structure the
22
+ generated text from the Click library to be more intuitive on this approach (i.e. list sub commands as `Destinations`).
23
+
24
+ ### Configs
25
+ The configs in [configs/](./configs) and connector specific ones in [cmds/](./cmds) help surface all user parameters that
26
+ are needed to marshal the input dictionary from Click into all the respective configs needed to create a full pipeline run.
27
+ Because click returns a flat dictionary of user inputs, the `extract_config` method in `utils.py` helps deserialize this dictionary
28
+ into dataclasses that have nested fields (such as access configs).
File without changes
@@ -0,0 +1,4 @@
1
+ from .dest import DestCmd
2
+ from .src import SrcCmd
3
+
4
+ __all__ = ["SrcCmd", "DestCmd"]
@@ -0,0 +1,269 @@
1
+ import inspect
2
+ from abc import ABC, abstractmethod
3
+ from collections import Counter
4
+ from dataclasses import dataclass, field, fields
5
+ from typing import Any, Optional, Type, TypeVar
6
+
7
+ import click
8
+ from pydantic import BaseModel
9
+
10
+ from unstructured_ingest.cli.base.importer import import_from_string
11
+ from unstructured_ingest.cli.utils.click import extract_config
12
+ from unstructured_ingest.cli.utils.model_conversion import options_from_base_model, post_check
13
+ from unstructured_ingest.interfaces import ProcessorConfig
14
+ from unstructured_ingest.logger import logger
15
+ from unstructured_ingest.pipeline.pipeline import Pipeline
16
+ from unstructured_ingest.processes.chunker import Chunker, ChunkerConfig
17
+ from unstructured_ingest.processes.connector_registry import (
18
+ DownloaderT,
19
+ IndexerT,
20
+ RegistryEntry,
21
+ UploaderT,
22
+ UploadStager,
23
+ UploadStagerConfig,
24
+ UploadStagerT,
25
+ destination_registry,
26
+ source_registry,
27
+ )
28
+ from unstructured_ingest.processes.connectors.local import LocalUploader, LocalUploaderConfig
29
+ from unstructured_ingest.processes.embedder import Embedder, EmbedderConfig
30
+ from unstructured_ingest.processes.filter import Filterer, FiltererConfig
31
+ from unstructured_ingest.processes.partitioner import Partitioner, PartitionerConfig
32
+
33
+ CommandT = TypeVar("CommandT", bound=click.Command)
34
+
35
+
36
@dataclass
class BaseCmd(ABC):
    """Shared base for the Click commands that build and run an ingest pipeline.

    Subclasses supply the connector-specific click options
    (``get_registry_options``) and the command callback (``cmd``). This base
    class turns the flat option dictionaries produced by Click into the
    config objects and process instances a ``Pipeline`` is built from.
    """

    # Name of the command; also used as the destination registry key by callers.
    cmd_name: str
    # Registry entry describing the connector this command maps to.
    registry_entry: RegistryEntry
    # Extra pydantic models whose fields are exposed as additional options.
    default_configs: list[Type[BaseModel]] = field(default_factory=list)

    @abstractmethod
    def get_registry_options(self):
        """Return the list of click options derived from ``registry_entry``."""
        pass

    def get_default_options(self) -> list[click.Option]:
        """Return the click options generated from every model in ``default_configs``."""
        return [
            option
            for config in self.default_configs
            for option in options_from_base_model(model=config)
        ]

    @classmethod
    def consolidate_options(cls, options: list[click.Option]) -> list[click.Option]:
        """Drop identical duplicate options, keeping first-seen order.

        Args:
            options: Options that may contain several entries with the same name.

        Returns:
            ``options`` itself when no name is repeated; otherwise a new list
            holding the first occurrence of each name.

        Raises:
            ValueError: If two options share a name but differ in any attribute.
        """
        name_counts = Counter(option.name for option in options)
        if all(count == 1 for count in name_counts.values()):
            return options

        # Dicts preserve insertion order, so this doubles as the result list
        # and as an O(1) lookup of the first option seen for each name.
        first_by_name: dict[Any, click.Option] = {}
        for option in options:
            existing_option = first_by_name.get(option.name)
            if existing_option is None:
                first_by_name[option.name] = option
                continue
            if existing_option.__dict__ == option.__dict__:
                continue
            option_diff = cls.get_options_diff(o1=option, o2=existing_option)
            # Sort so the error message is stable from run to run.
            ordered_diff = sorted(option_diff, key=lambda item: (item[0], repr(item[1])))
            raise ValueError(
                "Conflicting duplicate {} option defined: {}".format(
                    option.name, " | ".join([f"{d[0]}: {d[1]}" for d in ordered_diff])
                )
            )
        return list(first_by_name.values())

    @staticmethod
    def get_options_diff(o1: click.Option, o2: click.Option):
        """Return the attribute ``(name, value)`` pairs that differ between two options.

        The comparison works on copies of each option's ``__dict__`` so the
        options themselves are left untouched. ``opts`` and ``secondary_opts``
        are flattened to comma-joined strings, and any other unhashable value
        is represented by its ``repr`` so it can be placed in a set.

        Returns:
            The symmetric difference of the two options' attribute items.
        """

        def hashable(value: Any) -> Any:
            try:
                hash(value)
            except TypeError:
                return repr(value)
            return value

        def comparable_items(option: click.Option) -> set:
            # Copy first: writing into option.__dict__ would replace the
            # option's real ``opts`` lists with strings.
            attrs = dict(option.__dict__)
            for list_attr in ("opts", "secondary_opts"):
                attrs[list_attr] = ",".join(attrs[list_attr])
            return {(attr, hashable(value)) for attr, value in attrs.items()}

        return comparable_items(o1) ^ comparable_items(o2)

    @property
    def cmd_name_key(self):
        """``cmd_name`` with dashes replaced by underscores."""
        return self.cmd_name.replace("-", "_")

    @property
    def cli_cmd_name(self):
        """``cmd_name`` with underscores replaced by dashes."""
        return self.cmd_name.replace("_", "-")

    @abstractmethod
    def cmd(self, ctx: click.Context, **options) -> None:
        """Command callback invoked by Click with the parsed options."""
        pass

    def add_options(self, cmd: CommandT) -> CommandT:
        """Attach the registry and default options to ``cmd`` and return it.

        The combined options are passed through ``post_check`` before being
        appended to ``cmd.params``.
        """
        options = self.get_registry_options()
        options.extend(self.get_default_options())
        post_check(options=options, name=cmd.name)
        cmd.params.extend(options)
        return cmd

    def get_pipeline(
        self,
        src: str,
        source_options: dict[str, Any],
        dest: Optional[str] = None,
        destination_options: Optional[dict[str, Any]] = None,
    ) -> Pipeline:
        """Build a ``Pipeline`` from flat source and destination option dictionaries.

        Args:
            src: Key of the source connector in ``source_registry``.
            source_options: Flat options of the source command; also used for
                the processor, partitioner, chunker, filterer and embedder configs.
            dest: Key of the destination connector in ``destination_registry``.
                When falsy, a ``LocalUploader`` is used instead.
            destination_options: Flat options of the destination command.
                ``None`` is treated as an empty mapping; the caller's
                dictionary is never modified.

        Returns:
            The configured pipeline.

        Raises:
            KeyError: If no destination is given and ``output_dir`` is present
                in neither ``destination_options`` nor ``source_options``.
        """
        # Work on a private copy: guards against None and keeps the
        # ``output_dir`` default below from leaking into the caller's dict.
        destination_options = dict(destination_options or {})
        logger.debug(
            f"creating pipeline from cli using source {src} with options: {source_options}"
        )
        pipeline_kwargs: dict[str, Any] = {
            "context": self.get_processor_config(options=source_options),
            "downloader": self.get_downloader(src=src, options=source_options),
            "indexer": self.get_indexer(src=src, options=source_options),
            "partitioner": self.get_partitioner(options=source_options),
        }
        # Optional steps are only added when their config enables them.
        if chunker := self.get_chunker(options=source_options):
            pipeline_kwargs["chunker"] = chunker
        if filterer := self.get_filterer(options=source_options):
            pipeline_kwargs["filterer"] = filterer
        if embedder := self.get_embedder(options=source_options):
            pipeline_kwargs["embedder"] = embedder
        if dest:
            logger.debug(
                f"setting destination on pipeline {dest} with options: {destination_options}"
            )
            if uploader_stager := self.get_upload_stager(dest=dest, options=destination_options):
                pipeline_kwargs["stager"] = uploader_stager
            pipeline_kwargs["uploader"] = self.get_uploader(dest=dest, options=destination_options)
        else:
            # Default to local uploader
            # TODO remove after v1 no longer supported
            if "output_dir" not in destination_options:
                destination_options["output_dir"] = source_options["output_dir"]
            pipeline_kwargs["uploader"] = self.get_default_uploader(options=destination_options)
        return Pipeline(**pipeline_kwargs)

    @staticmethod
    def get_default_uploader(options: dict[str, Any]) -> UploaderT:
        """Return a ``LocalUploader`` configured from ``options``."""
        uploader_config = extract_config(flat_data=options, config=LocalUploaderConfig)
        return LocalUploader(upload_config=uploader_config)

    @staticmethod
    def get_chunker(options: dict[str, Any]) -> Optional[Chunker]:
        """Return a ``Chunker``, or ``None`` when no chunking strategy is set."""
        chunker_config = extract_config(flat_data=options, config=ChunkerConfig)
        if not chunker_config.chunking_strategy:
            return None
        return Chunker(config=chunker_config)

    @staticmethod
    def get_filterer(options: dict[str, Any]) -> Optional[Filterer]:
        """Return a ``Filterer``, or ``None`` when the config dumps to an empty dict."""
        filterer_configs = extract_config(flat_data=options, config=FiltererConfig)
        if not filterer_configs.model_dump():
            return None
        return Filterer(config=filterer_configs)

    @staticmethod
    def get_embedder(options: dict[str, Any]) -> Optional[Embedder]:
        """Return an ``Embedder``, or ``None`` when no embedding provider is set."""
        embedder_config = extract_config(flat_data=options, config=EmbedderConfig)
        if not embedder_config.embedding_provider:
            return None
        return Embedder(config=embedder_config)

    @staticmethod
    def get_partitioner(options: dict[str, Any]) -> Partitioner:
        """Return a ``Partitioner`` configured from ``options``."""
        partitioner_config = extract_config(flat_data=options, config=PartitionerConfig)
        return Partitioner(config=partitioner_config)

    @staticmethod
    def get_processor_config(options: dict[str, Any]) -> ProcessorConfig:
        """Return the ``ProcessorConfig`` extracted from ``options``."""
        return extract_config(flat_data=options, config=ProcessorConfig)

    @staticmethod
    def get_indexer(src: str, options: dict[str, Any]) -> IndexerT:
        """Instantiate the indexer registered for ``src``.

        The index and connection configs are only passed when the registry
        entry declares the corresponding config class.

        Raises:
            KeyError: If ``src`` is not a key of ``source_registry``.
        """
        source_entry = source_registry[src]
        indexer_kwargs: dict[str, Any] = {}
        if indexer_config_cls := source_entry.indexer_config:
            indexer_kwargs["index_config"] = extract_config(
                flat_data=options, config=indexer_config_cls
            )
        if connection_config_cls := source_entry.connection_config:
            indexer_kwargs["connection_config"] = extract_config(
                flat_data=options, config=connection_config_cls
            )
        indexer_cls = source_entry.indexer
        return indexer_cls(**indexer_kwargs)

    @staticmethod
    def get_downloader(src: str, options: dict[str, Any]) -> DownloaderT:
        """Instantiate the downloader registered for ``src``.

        The download and connection configs are only passed when the registry
        entry declares the corresponding config class.

        Raises:
            KeyError: If ``src`` is not a key of ``source_registry``.
        """
        source_entry = source_registry[src]
        downloader_kwargs: dict[str, Any] = {}
        if downloader_config_cls := source_entry.downloader_config:
            downloader_kwargs["download_config"] = extract_config(
                flat_data=options, config=downloader_config_cls
            )
        if connection_config_cls := source_entry.connection_config:
            downloader_kwargs["connection_config"] = extract_config(
                flat_data=options, config=connection_config_cls
            )
        downloader_cls = source_entry.downloader
        return downloader_cls(**downloader_kwargs)

    @staticmethod
    def get_custom_stager(
        stager_reference: str, stager_config_kwargs: Optional[dict] = None
    ) -> Optional[UploadStagerT]:
        """Import and instantiate a user-supplied upload stager.

        Args:
            stager_reference: String passed to ``import_from_string`` to
                resolve the stager class.
            stager_config_kwargs: Keyword arguments for the stager's config
                class. When empty or ``None`` the stager is created without
                an explicit config.

        Raises:
            ValueError: If the reference is not a class, is not an
                ``UploadStager`` subclass, or its ``upload_stager_config``
                field type is not an ``UploadStagerConfig`` subclass.
            KeyError: If the stager declares no ``upload_stager_config`` field.
        """
        stager_cls = import_from_string(stager_reference)
        if not inspect.isclass(stager_cls):
            raise ValueError(
                f"custom stager must be a reference to a python class, got: {type(stager_cls)}"
            )
        if not issubclass(stager_cls, UploadStager):
            raise ValueError(
                "custom stager must be an implementation of the UploadStager interface"
            )
        # The config class is discovered from the stager's own dataclass field.
        field_types = {f.name: f.type for f in fields(stager_cls)}
        upload_stager_config_cls = field_types["upload_stager_config"]
        if not inspect.isclass(upload_stager_config_cls):
            raise ValueError(
                f"custom stager config must be a class, got: {type(upload_stager_config_cls)}"
            )
        if not issubclass(upload_stager_config_cls, UploadStagerConfig):
            raise ValueError(
                "custom stager config must be an implementation "
                "of the UploadStagerConfig interface"
            )
        upload_stager_kwargs: dict[str, Any] = {}
        if stager_config_kwargs:
            upload_stager_kwargs["upload_stager_config"] = upload_stager_config_cls(
                **stager_config_kwargs
            )
        return stager_cls(**upload_stager_kwargs)

    @staticmethod
    def get_upload_stager(dest: str, options: dict[str, Any]) -> Optional[UploadStagerT]:
        """Return the upload stager for ``dest``, or ``None`` if it has none.

        A ``custom_stager`` entry in ``options`` takes precedence over the
        stager declared in ``destination_registry``.

        Raises:
            KeyError: If no custom stager is given and ``dest`` is not a key
                of ``destination_registry``.
        """
        if custom_stager := options.get("custom_stager"):
            return BaseCmd.get_custom_stager(
                stager_reference=custom_stager,
                stager_config_kwargs=options.get("custom_stager_config_kwargs"),
            )
        dest_entry = destination_registry[dest]
        upload_stager_kwargs: dict[str, Any] = {}
        if upload_stager_config_cls := dest_entry.upload_stager_config:
            upload_stager_kwargs["upload_stager_config"] = extract_config(
                flat_data=options, config=upload_stager_config_cls
            )
        if upload_stager_cls := dest_entry.upload_stager:
            return upload_stager_cls(**upload_stager_kwargs)
        return None

    @staticmethod
    def get_uploader(dest, options: dict[str, Any]) -> UploaderT:
        """Instantiate the uploader registered for ``dest``.

        The upload and connection configs are only passed when the registry
        entry declares the corresponding config class.

        Raises:
            KeyError: If ``dest`` is not a key of ``destination_registry``.
        """
        dest_entry = destination_registry[dest]
        uploader_kwargs: dict[str, Any] = {}
        if uploader_config_cls := dest_entry.uploader_config:
            uploader_kwargs["upload_config"] = extract_config(
                flat_data=options, config=uploader_config_cls
            )
        if connection_config_cls := dest_entry.connection_config:
            uploader_kwargs["connection_config"] = extract_config(
                flat_data=options, config=connection_config_cls
            )
        uploader_cls = dest_entry.uploader
        return uploader_cls(**uploader_kwargs)
@@ -0,0 +1,84 @@
1
+ import logging
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.cli.base.cmd import BaseCmd
7
+ from unstructured_ingest.cli.utils.click import Dict, conform_click_options
8
+ from unstructured_ingest.cli.utils.model_conversion import options_from_base_model
9
+ from unstructured_ingest.logger import logger
10
+ from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
11
+
12
+
13
@dataclass
class DestCmd(BaseCmd):
    """CLI command wrapper for a single destination connector.

    The generated click command is meant to be invoked as a child of a source
    command: at run time it reads the parent context's name and parameters to
    learn which source to pair with this destination.
    """

    # Registry entry whose config classes are turned into click options.
    registry_entry: DestinationRegistryEntry

    def get_registry_options(self):
        """Return the consolidated click options derived from this destination's configs.

        Options are generated for the uploader, upload stager and connection
        config classes that the registry entry declares (falsy entries are
        skipped) and then passed through ``self.consolidate_options``.
        """
        options = []
        configs = [
            config
            for config in [
                self.registry_entry.uploader_config,
                self.registry_entry.upload_stager_config,
                self.registry_entry.connection_config,
            ]
            if config
        ]
        for config in configs:
            options.extend(options_from_base_model(model=config))
        return self.consolidate_options(options=options)

    def cmd(self, ctx: click.Context, **options) -> None:
        # NOTE: intentionally no docstring here. click falls back to the
        # callback's docstring for the command help text, so adding one would
        # change the `--help` output of every destination command.
        logger.setLevel(logging.DEBUG if options.get("verbose", False) else logging.INFO)
        if not ctx.parent:
            raise click.ClickException("destination command called without a parent")
        if not ctx.parent.info_name:
            raise click.ClickException("parent command missing info name")
        # The parent's CLI name uses dashes; the source key uses underscores.
        source_cmd = ctx.parent.info_name.replace("-", "_")
        # ctx.parent was validated above, so no fallback value is needed.
        source_options: dict = ctx.parent.params
        conform_click_options(options)
        try:
            pipeline = self.get_pipeline(
                src=source_cmd,
                source_options=source_options,
                dest=self.cmd_name,
                destination_options=options,
            )
            pipeline.run()
        except Exception as e:
            # Top-level CLI boundary: log the full traceback, then surface a
            # click-friendly error while keeping the original as the cause.
            logger.error(f"failed to run destination command {self.cmd_name}: {e}", exc_info=True)
            raise click.ClickException(str(e)) from e

    def get_cmd(self) -> click.Command:
        """Build the click command for this destination.

        The command is assembled programmatically (no decorators), given the
        options produced by ``self.add_options`` and two extra options that
        allow a custom upload stager to be supplied.

        Raises:
            ValueError: If click did not produce a ``click.core.Command``.
        """
        # Dynamically create the command without the use of click decorators
        fn = self.cmd
        fn = click.pass_context(fn)
        cmd = click.command(fn)
        if not isinstance(cmd, click.core.Command):
            raise ValueError(f"generated command was not of expected type Command: {type(cmd)}")
        cmd.name = self.cli_cmd_name
        # NOTE(review): kept from the original for parity with the source
        # command; this attribute is normally only meaningful on groups.
        cmd.invoke_without_command = True
        self.add_options(cmd)
        cmd.params.append(
            click.Option(
                ["--custom-stager"],
                required=False,
                type=str,
                default=None,
                help="Pass a pointer to a custom upload stager to use, "
                "must be in format '<module>:<attribute>'",
            )
        )
        cmd.params.append(
            click.Option(
                ["--custom-stager-config-kwargs"],
                required=False,
                type=Dict(),
                default=None,
                help="Any kwargs to instantiate the configuration "
                "associated with the custom stager",
            )
        )
        return cmd
@@ -0,0 +1,34 @@
1
+ import importlib
2
+ from typing import Any
3
+
4
+
5
class ImportFromStringError(Exception):
    """Raised when an import string is malformed or cannot be resolved."""


def import_from_string(import_str: Any) -> Any:
    """Resolve a ``"<module>:<attribute>"`` string to the object it names.

    Args:
        import_str: The import string. Any non-string value is returned
            unchanged, so callers may pass an already-resolved object.

    Returns:
        The object found by importing ``<module>`` and walking the
        dot-separated ``<attribute>`` path from it.

    Raises:
        ImportFromStringError: If the string is not in
            ``"<module>:<attribute>"`` form, if the requested module itself
            cannot be found, or if the attribute path does not exist. In the
            latter two cases the underlying error is attached as
            ``__cause__``.
        ModuleNotFoundError: If a *different* module than the requested one
            is missing (for example a dependency imported by the target
            module); that error is re-raised as-is.
    """
    if not isinstance(import_str, str):
        return import_str

    module_str, _, attrs_str = import_str.partition(":")
    if not module_str or not attrs_str:
        raise ImportFromStringError(
            f'Import string "{import_str}" must be in format "<module>:<attribute>".'
        )

    try:
        module = importlib.import_module(module_str)
    except ModuleNotFoundError as exc:
        if exc.name != module_str:
            # The missing module is not the one that was asked for, so this
            # is not a bad import string; let the original error through.
            raise exc from None
        raise ImportFromStringError(f'Could not import module "{module_str}".') from exc

    instance = module
    try:
        for attr_str in attrs_str.split("."):
            instance = getattr(instance, attr_str)
    except AttributeError as exc:
        raise ImportFromStringError(
            f'Attribute "{attrs_str}" not found in module "{module_str}".'
        ) from exc

    return instance
@@ -0,0 +1,75 @@
1
+ import logging
2
+ from dataclasses import dataclass, field
3
+ from typing import Any
4
+
5
+ import click
6
+ from pydantic import BaseModel
7
+
8
+ from unstructured_ingest.cli.base.cmd import BaseCmd
9
+ from unstructured_ingest.cli.utils.click import Group, conform_click_options
10
+ from unstructured_ingest.cli.utils.model_conversion import options_from_base_model
11
+ from unstructured_ingest.interfaces import ProcessorConfig
12
+ from unstructured_ingest.logger import logger
13
+ from unstructured_ingest.processes import (
14
+ ChunkerConfig,
15
+ EmbedderConfig,
16
+ FiltererConfig,
17
+ PartitionerConfig,
18
+ )
19
+ from unstructured_ingest.processes.connector_registry import SourceRegistryEntry
20
+
21
+
22
@dataclass
class SrcCmd(BaseCmd):
    """CLI command wrapper for a single source connector.

    The generated click object is a group, so that destination commands can be
    attached to it as sub-commands; when invoked without a sub-command it runs
    a pipeline for the source alone.
    """

    # Registry entry whose config classes are turned into click options.
    registry_entry: SourceRegistryEntry
    # Config model classes (not instances) for the processing steps.
    # NOTE(review): presumably consumed by BaseCmd when building options --
    # the consumer is outside this view, confirm there.
    default_configs: list[type[BaseModel]] = field(
        default_factory=lambda: [
            ProcessorConfig,
            PartitionerConfig,
            EmbedderConfig,
            FiltererConfig,
            ChunkerConfig,
        ]
    )

    def get_registry_options(self):
        """Return the consolidated click options derived from this source's configs.

        Options are generated for the connection, indexer and downloader
        config classes that the registry entry declares (falsy entries are
        skipped) and then passed through ``self.consolidate_options``.
        """
        options = []
        configs = [
            config
            for config in [
                self.registry_entry.connection_config,
                self.registry_entry.indexer_config,
                self.registry_entry.downloader_config,
            ]
            if config
        ]
        for config in configs:
            options.extend(options_from_base_model(model=config))
        return self.consolidate_options(options=options)

    def cmd(self, ctx: click.Context, **options: Any) -> None:
        # NOTE: intentionally no docstring here. click falls back to the
        # callback's docstring for the command help text, so adding one would
        # change the `--help` output of every source command.
        if ctx.invoked_subcommand:
            # A sub-command was invoked; this group callback does nothing.
            return

        conform_click_options(options)
        logger.setLevel(logging.DEBUG if options.get("verbose", False) else logging.INFO)
        try:
            pipeline = self.get_pipeline(src=self.cmd_name, source_options=options)
            pipeline.run()
        except Exception as e:
            # Top-level CLI boundary: log the full traceback, then surface a
            # click-friendly error while keeping the original as the cause.
            logger.error(f"failed to run source command {self.cmd_name}: {e}", exc_info=True)
            raise click.ClickException(str(e)) from e

    def get_cmd(self) -> click.Group:
        """Build the click group for this source.

        The group is assembled programmatically (no decorators) using the
        project's ``Group`` class and is configured to run its callback even
        when no sub-command is given.

        Raises:
            ValueError: If click did not produce a ``click.core.Group``.
        """
        # Dynamically create the command without the use of click decorators
        fn = self.cmd
        fn = click.pass_context(fn)
        cmd = click.group(fn, cls=Group)
        if not isinstance(cmd, click.core.Group):
            raise ValueError(f"generated src command was not of expected type Group: {type(cmd)}")
        cmd.name = self.cli_cmd_name
        cmd.invoke_without_command = True
        self.add_options(cmd)

        return cmd
@@ -0,0 +1,24 @@
1
+ import click
2
+
3
+ from unstructured_ingest.cli.cmds import dest, src
4
+
5
+
6
@click.group()
def ingest():
    # Root group of the CLI; it only serves as the attachment point for the
    # source sub-commands. A comment is used instead of a docstring because
    # click would otherwise show the docstring as the group's help text.
    return None
9
+
10
+
11
def get_cmd() -> click.Command:
    """Assemble the full CLI command tree and return its root.

    Every destination command is registered on every source group, and each
    source group is then registered on the root ``ingest`` group, producing
    ``ingest <source> <destination>`` style nesting.
    """
    root = ingest
    for source_group in src:
        # Attach the complete set of destinations beneath this source.
        for destination_command in dest:
            source_group.add_command(destination_command)
        root.add_command(source_group)
    return root
@@ -0,0 +1,14 @@
1
+ import click
2
+
3
+ from unstructured_ingest.cli.base import DestCmd, SrcCmd
4
+ from unstructured_ingest.processes.connector_registry import (
5
+ destination_registry,
6
+ source_registry,
7
+ )
8
+
9
# One command wrapper per registered connector, keyed by its registry name.
src_cmds = [
    SrcCmd(cmd_name=name, registry_entry=entry) for name, entry in source_registry.items()
]
dest_cmds = [
    DestCmd(cmd_name=name, registry_entry=entry) for name, entry in destination_registry.items()
]

# The click objects built from the wrappers above.
src: list[click.Group] = [wrapper.get_cmd() for wrapper in src_cmds]

dest: list[click.Command] = [wrapper.get_cmd() for wrapper in dest_cmds]
File without changes