unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243)
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,233 @@
1
+ from abc import ABC
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+ from typing import Any, Optional
5
+
6
+ from pydantic import BaseModel, Field, SecretStr
7
+
8
+ from unstructured_ingest.error import UserError
9
+ from unstructured_ingest.interfaces.process import BaseProcess
10
+ from unstructured_ingest.logger import logger
11
+ from unstructured_ingest.unstructured_api import call_api_async
12
+ from unstructured_ingest.utils.data_prep import flatten_dict
13
+ from unstructured_ingest.utils.dep_check import requires_dependencies
14
+
15
+
16
class PartitionerConfig(BaseModel):
    # Settings for the partition step: how documents are partitioned (locally or
    # through a remote API) and how the resulting elements are post-processed.
    # NOTE: kept as `#` comments rather than a docstring so the generated model
    # schema is unchanged.

    # --- partitioning behaviour -------------------------------------------
    strategy: str = Field(
        default="auto",
        description="The method that will be used to process the documents. ",
        examples=["fast", "hi_res", "auto"],
    )
    ocr_languages: Optional[list[str]] = Field(
        default=None,
        description="A list of language packs to specify which languages to use for OCR, "
        "The appropriate Tesseract language pack needs to be installed.",
        examples=["eng", "deu", "eng,deu"],
    )
    encoding: Optional[str] = Field(
        default=None,
        description="Text encoding to use when reading documents. "
        "By default the encoding is detected automatically.",
    )
    additional_partition_args: Optional[dict[str, Any]] = Field(
        default=None, description="Additional values to pass through to partition()"
    )
    skip_infer_table_types: Optional[list[str]] = Field(
        default=None, description="Optional list of document data_types to skip table extraction on"
    )

    # --- post-processing of the produced elements -------------------------
    fields_include: list[str] = Field(
        default_factory=lambda: ["element_id", "text", "type", "metadata", "embeddings"],
        description="If set, include the specified top-level fields in an element.",
    )
    flatten_metadata: bool = Field(
        default=False,
        description="Results in flattened json elements. "
        "Specifically, the metadata key values are brought to "
        "the top-level of the element, and the `metadata` key itself is removed.",
    )
    metadata_exclude: list[str] = Field(
        default_factory=list,
        description="If set, drop the specified metadata fields if they exist.",
    )
    element_exclude: list[str] = Field(
        default_factory=list,
        description="If set, drop the specified element data_types, if they exist.",
    )
    metadata_include: list[str] = Field(
        default_factory=list,
        description="If set, include the specified metadata "
        "fields if they exist and drop all other fields. ",
    )

    # --- remote API partitioning ------------------------------------------
    partition_endpoint: Optional[str] = Field(
        default="https://api.unstructuredapp.io/general/v0/general",
        description="If partitioning via api, use the following host.",
    )
    partition_by_api: bool = Field(
        default=False,
        description="Use a remote API to partition the files."
        " Otherwise, use the function from partition.auto",
    )
    api_timeout_ms: Optional[int] = Field(
        default=None, description="Timeout in milliseconds for all api call during partitioning."
    )
    api_key: Optional[SecretStr] = Field(
        default=None, description="API Key for partition endpoint."
    )
    hi_res_model_name: Optional[str] = Field(
        default=None, description="Model name for hi-res strategy."
    )
    raise_unsupported_filetype: bool = Field(
        default=False, description="Raise an error if the file type is not supported"
    )

    def model_post_init(self, __context: Any) -> None:
        # Include-lists and exclude-lists for metadata cannot be combined:
        # reject the configuration as soon as the model is built.
        both_given = bool(self.metadata_exclude) and bool(self.metadata_include)
        if not both_given:
            return
        raise ValueError(
            "metadata_exclude and metadata_include are "
            "mutually exclusive with each other. Cannot specify both."
        )

    def to_partition_kwargs(self) -> dict[str, Any]:
        # Candidate keyword arguments, in the order they are forwarded.
        candidates = (
            ("strategy", self.strategy),
            ("languages", self.ocr_languages),
            ("hi_res_model_name", self.hi_res_model_name),
            ("skip_infer_table_types", self.skip_infer_table_types),
        )
        # Unset (None) values are left out so the callee's own defaults apply.
        forwarded: dict[str, Any] = {
            name: value for name, value in candidates if value is not None
        }
        # User-supplied extras are applied last and therefore win on key clashes.
        if self.additional_partition_args:
            forwarded.update(self.additional_partition_args)
        return forwarded
103
+
104
+
105
@dataclass
class Partitioner(BaseProcess, ABC):
    """Partition a file into a list of element dicts, locally or via a remote API.

    The mode is selected by ``config.partition_by_api``: when true the process
    reports itself as async and ``run_async`` calls the remote endpoint,
    otherwise ``run`` partitions with the local ``unstructured`` library.
    Both paths pass their result through ``postprocess``.
    """

    config: PartitionerConfig

    def is_async(self) -> bool:
        """Return True when partitioning is delegated to the remote API."""
        return self.config.partition_by_api

    @staticmethod
    def _drop_nested_field(elem: dict, dotted_path: str) -> None:
        """Remove the field addressed by a dot-separated path, if it exists.

        The path is resolved starting at the element itself (e.g.
        ``"metadata.coordinates"``). If any intermediate key is missing, or an
        intermediate value is not a dict, nothing is removed.
        """
        *parents, leaf = dotted_path.split(".")
        current = elem
        for key in parents:
            if not isinstance(current, dict) or key not in current:
                # The path does not exist in this element: do not fall through
                # and pop `leaf` from an unrelated (shallower) level.
                return
            current = current[key]
        if isinstance(current, dict):
            current.pop(leaf, None)

    def postprocess(self, elements: list[dict]) -> list[dict]:
        """Apply the configured element/metadata filters and return new element dicts.

        Steps, in order:
          1. drop elements whose ``type`` is in ``config.element_exclude``;
          2. apply ``config.metadata_exclude`` (dotted entries are resolved from
             the element root, plain entries are keys of ``metadata``) or,
             otherwise, keep only ``config.metadata_include`` metadata keys;
          3. keep only the top-level keys listed in ``config.fields_include``;
          4. if ``config.flatten_metadata`` is set, replace ``metadata`` with its
             flattened key/values at the top level of the element.

        Args:
            elements: element dicts as produced by partitioning.

        Returns:
            A new list of element dicts. Each element is a shallow copy, so
            nested dicts such as ``metadata`` are shared with the input and
            metadata exclusion/inclusion edits them in place.
        """
        excluded_types = self.config.element_exclude
        processed: list[dict] = []
        for original in elements:
            if excluded_types and original["type"] in excluded_types:
                continue
            elem = original.copy()

            if self.config.metadata_exclude:
                for ex in self.config.metadata_exclude:
                    if "." in ex:  # handle nested fields
                        self._drop_nested_field(elem, ex)
                    else:  # handle top-level metadata fields
                        elem["metadata"].pop(ex, None)  # type: ignore[attr-defined]
            elif self.config.metadata_include:
                keep = self.config.metadata_include
                for key in list(elem["metadata"].keys()):  # type: ignore[attr-defined]
                    if key not in keep:
                        elem["metadata"].pop(key, None)  # type: ignore[attr-defined]

            # Build the filtered element and store *that* in the result; rebinding
            # the loop variable alone would silently discard this step.
            allowed_fields = self.config.fields_include
            elem = {k: v for k, v in elem.items() if k in allowed_fields}

            if self.config.flatten_metadata and "metadata" in elem:
                metadata = elem.pop("metadata")
                elem.update(flatten_dict(metadata, keys_to_omit=["data_source_record_locator"]))
            processed.append(elem)
        return processed

    @requires_dependencies(dependencies=["unstructured"])
    def partition_locally(
        self, filename: Path, metadata: Optional[dict] = None, **kwargs
    ) -> list[dict]:
        """Partition ``filename`` with the local ``unstructured`` library.

        Args:
            filename: path of the file to partition.
            metadata: data-source metadata attached to the produced elements;
                treated as an empty dict when omitted.
            **kwargs: accepted for interface compatibility; not used here.

        Returns:
            Post-processed element dicts, or an empty list when the file type is
            unsupported and ``config.raise_unsupported_filetype`` is false.

        Raises:
            ValueError: re-raised from ``partition`` unless it is recognised as
                an unsupported-file-type error that the config says to ignore.
        """
        from unstructured.documents.elements import DataSourceMetadata
        from unstructured.partition.auto import partition
        from unstructured.staging.base import elements_to_dicts

        # Local subclass adding a `filesize_bytes` field to the data-source
        # metadata (presumably because incoming metadata dicts carry that key
        # -- verify against the caller).
        @dataclass
        class FileDataSourceMetadata(DataSourceMetadata):
            filesize_bytes: Optional[int] = None

        metadata = metadata or {}
        logger.debug(f"using local partition with kwargs: {self.config.to_partition_kwargs()}")
        logger.debug(f"partitioning file {filename} with metadata {metadata}")
        try:
            elements = partition(
                filename=str(filename.resolve()),
                data_source_metadata=FileDataSourceMetadata.from_dict(metadata),
                **self.config.to_partition_kwargs(),
            )
        except ValueError as sdk_error:
            if (
                self.is_unstructured_error_unsupported_filetype(sdk_error=sdk_error)
                and not self.config.raise_unsupported_filetype
            ):
                logger.warning(
                    f"Unsupported file type for strategy {self.config.strategy}: {filename}"
                )
                return []
            raise
        return self.postprocess(elements=elements_to_dicts(elements))

    @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
    async def partition_via_api(
        self, filename: Path, metadata: Optional[dict] = None, **kwargs
    ) -> list[dict]:
        """Partition ``filename`` through the configured remote endpoint.

        Args:
            filename: path of the file to send.
            metadata: data-source metadata stored on every returned element
                under ``metadata["data_source"]``; empty dict when omitted.
            **kwargs: accepted for interface compatibility; not used here.

        Returns:
            Post-processed element dicts.
        """
        metadata = metadata or {}
        logger.debug(f"partitioning file {filename} with metadata: {metadata}")

        # NOTE: `config.api_key` defaults to None; it must be set for this path,
        # otherwise `.get_secret_value()` raises AttributeError.
        elements = await call_api_async(
            server_url=self.config.partition_endpoint,
            api_key=self.config.api_key.get_secret_value(),
            filename=filename,
            api_parameters=self.config.to_partition_kwargs(),
            timeout_ms=self.config.api_timeout_ms,
        )

        # Append the data source metadata the auto partition does for you
        for element in elements:
            element["metadata"]["data_source"] = metadata
        return self.postprocess(elements=elements)

    def is_unstructured_error_unsupported_filetype(self, sdk_error: ValueError) -> bool:
        """Return True if a local-partition ``ValueError`` signals an unsupported file type.

        Detection is by substring match on the first exception argument. An
        exception without arguments (or with a non-string first argument) is
        handled without raising, so the original error is never masked.
        """
        error_msg = str(sdk_error.args[0]) if sdk_error.args else ""
        return (
            "Invalid file" in error_msg
            or "Unstructured schema" in error_msg
            or "fast strategy is not available for image files" in error_msg
        )

    def is_client_error_unsupported_filetype(self, error: UserError) -> bool:
        """Return True if an API ``UserError`` signals an unsupported file type.

        Detection is by substring match on the first exception argument; a
        missing or non-string argument is handled without raising.
        """
        error_msg = str(error.args[0]) if error.args else ""
        lowered = error_msg.lower()
        return "fast strategy is not available for image files" in error_msg or (
            "file type" in lowered and "is not supported" in lowered
        )

    def run(self, filename: Path, metadata: Optional[dict] = None, **kwargs) -> list[dict]:
        """Synchronous entry point: partition the file locally."""
        return self.partition_locally(filename, metadata=metadata, **kwargs)

    async def run_async(
        self, filename: Path, metadata: Optional[dict] = None, **kwargs
    ) -> list[dict]:
        """Asynchronous entry point: partition the file through the remote API.

        Returns an empty list (after logging a warning) when the API reports an
        unsupported file type and ``config.raise_unsupported_filetype`` is
        false; any other ``UserError`` is re-raised.
        """
        try:
            return await self.partition_via_api(filename, metadata=metadata, **kwargs)
        except UserError as user_error:
            if (
                self.is_client_error_unsupported_filetype(error=user_error)
                and not self.config.raise_unsupported_filetype
            ):
                logger.warning(
                    f"Unsupported file type for strategy {self.config.strategy}: {filename}"
                )
                return []
            raise
@@ -0,0 +1,61 @@
1
+ from abc import ABC
2
+ from copy import copy
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+ from typing import Any
6
+ from uuid import NAMESPACE_DNS, uuid5
7
+
8
+ from pydantic import BaseModel
9
+
10
+ from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
11
+ from unstructured_ingest.interfaces.process import BaseProcess
12
+ from unstructured_ingest.logger import logger
13
+ from unstructured_ingest.utils.compression import TAR_FILE_EXT, ZIP_FILE_EXT, uncompress_file
14
+
15
+
16
class UncompressConfig(BaseModel):
    # Configuration model for the uncompress step; it declares no fields.
    pass
18
+
19
+
20
@dataclass
class Uncompressor(BaseProcess, ABC):
    """Expand a downloaded tar/zip archive into one ``FileData`` record per extracted file."""

    config: UncompressConfig = field(default_factory=UncompressConfig)

    def is_async(self) -> bool:
        """Report this process as async."""
        return True

    def run(self, file_data: FileData, **kwargs: Any) -> list[FileData]:
        """Uncompress the file referenced by ``file_data`` when it is an archive.

        Args:
            file_data: record whose ``local_download_path`` points at the file.
            **kwargs: accepted for interface compatibility; not used here.

        Returns:
            ``[file_data]`` unchanged when the file suffix is not one of the
            tar/zip extensions; otherwise one shallow-copied record per
            extracted file, each with its own identifier, local path and
            source identifiers.
        """
        archive_path = Path(file_data.local_download_path)
        archive_suffixes = TAR_FILE_EXT + ZIP_FILE_EXT
        if archive_path.suffix not in archive_suffixes:
            return [file_data]

        extracted_root = uncompress_file(filename=str(archive_path))
        extracted_files = [p for p in Path(extracted_root).rglob("*") if p.is_file()]
        listing = ", ".join([str(p) for p in extracted_files])
        logger.debug(
            "uncompressed {} files from original file {}: {}".format(
                len(extracted_files), archive_path, listing
            )
        )

        download_dir = str(archive_path.parent)
        results = []
        for extracted in extracted_files:
            entry = copy(file_data)
            # Identifier is derived deterministically from the extracted path.
            entry.identifier = str(uuid5(NAMESPACE_DNS, str(extracted)))
            entry.local_download_path = str(extracted.resolve())

            # Path of the extracted file relative to the archive's directory:
            # strip the directory text, then the leading separator character.
            inner_rel_path = str(extracted).replace(download_dir, "")[1:]

            source_ids = file_data.source_identifiers
            archive_name = source_ids.filename
            # Swap the archive's filename for the extracted file's relative path.
            new_fullpath = str(source_ids.fullpath).replace(archive_name, inner_rel_path)
            if source_ids.rel_path:
                new_rel_path = str(source_ids.rel_path).replace(archive_name, inner_rel_path)
            else:
                new_rel_path = None

            entry.source_identifiers = SourceIdentifiers(
                filename=extracted.name,
                fullpath=new_fullpath,
                rel_path=new_rel_path,
            )
            results.append(entry)
        return results

    async def run_async(self, file_data: FileData, **kwargs: Any) -> list[FileData]:
        """Async entry point; delegates directly to the synchronous ``run``."""
        return self.run(file_data=file_data, **kwargs)
@@ -0,0 +1,8 @@
1
+ from .logging.connector import ConnectorLoggingMixin, LoggingConfig
2
+ from .logging.sanitizer import DataSanitizer
3
+
4
# Names re-exported as the public API of this package.
__all__ = ["ConnectorLoggingMixin", "DataSanitizer", "LoggingConfig"]
@@ -0,0 +1,32 @@
1
+ from dataclasses import dataclass, field
2
+ from pathlib import Path
3
+ from typing import Any
4
+
5
+ from unstructured_ingest.data_types.file_data import FileData
6
+ from unstructured_ingest.interfaces import UploadStager, UploadStagerConfig
7
+ from unstructured_ingest.utils.data_prep import get_json_data, write_data
8
+
9
+
10
class BlobStoreUploadStagerConfig(UploadStagerConfig):
    # Stager configuration for blob stores; adds nothing to UploadStagerConfig.
    pass
12
+
13
+
14
@dataclass
class BlobStoreUploadStager(UploadStager):
    """Stage element data for blob-store upload by rewriting it as a ``.json`` file."""

    upload_stager_config: BlobStoreUploadStagerConfig = field(
        default_factory=BlobStoreUploadStagerConfig
    )

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Read the elements file and write its content to a ``.json`` output file.

        Args:
            elements_filepath: file holding the elements to stage.
            file_data: record describing the source file; not used here.
            output_dir: directory passed to ``get_output_path``.
            output_filename: file name passed to ``get_output_path``.
            **kwargs: accepted for interface compatibility; not used here.

        Returns:
            The path written to: the resolved output path with its suffix
            replaced by ``.json``.
        """
        staged_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
        elements = get_json_data(elements_filepath)
        # Always save as json, whatever suffix the resolved output path carries.
        json_path = staged_path.with_suffix(".json")
        write_data(path=json_path, data=elements)
        return json_path