unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,89 @@
1
+ from abc import ABC
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+ from typing import Any, TypeVar
5
+
6
+ from pydantic import BaseModel
7
+
8
+ from unstructured_ingest.data_types.file_data import FileData
9
+ from unstructured_ingest.interfaces import BaseProcess
10
+ from unstructured_ingest.utils import ndjson
11
+ from unstructured_ingest.utils.data_prep import get_json_data, write_data
12
+
13
+
14
# Base configuration model for upload stagers; it declares no fields of its own.
# NOTE(review): documented with comments instead of a docstring because pydantic
# may surface a model docstring as the schema description -- confirm before converting.
class UploadStagerConfig(BaseModel):
    pass


# Type variable bounded by UploadStagerConfig, used to annotate the stager's config field.
UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
19
+
20
+
21
@dataclass
class UploadStager(BaseProcess, ABC):
    """Base process that rewrites an elements file into a staged copy.

    Every element read from the input file is passed through ``conform_dict``
    and the result is written to a file placed under the requested output
    directory.
    """

    upload_stager_config: UploadStagerConfigT

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Return the element to write; this base version returns it unchanged."""
        return element_dict

    def get_output_path(self, output_filename: str, output_dir: Path) -> Path:
        """Build the output path under ``output_dir`` and create its parent directory.

        Only the final component of ``output_filename`` is kept.
        """
        # A path's stem followed by its suffix is exactly its final component.
        basename = Path(output_filename).name
        staged_path = Path(output_dir) / Path(basename)
        staged_path.parent.mkdir(parents=True, exist_ok=True)
        return staged_path

    def stream_update(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
        """Conform an ndjson file one element at a time, flushing after each row."""
        with input_file.open() as source:
            rows = ndjson.reader(source)
            with output_file.open("w") as sink:
                out = ndjson.writer(sink)
                for row in rows:
                    out.write(row=self.conform_dict(element_dict=row, file_data=file_data))
                    out.f.flush()

    def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
        """Load the whole input file, conform every element, and write the result."""
        conformed = []
        for element in get_json_data(path=input_file):
            conformed.append(self.conform_dict(element_dict=element, file_data=file_data))
        write_data(path=output_file, data=conformed)

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Stage ``elements_filepath`` and return the path of the staged file.

        Raises:
            ValueError: if the input suffix is neither ``.ndjson`` nor ``.json``.
        """
        output_file = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
        # Pick the strategy from the input suffix: streamed for ndjson, whole-file for json.
        strategies = {".ndjson": self.stream_update, ".json": self.process_whole}
        strategy = strategies.get(elements_filepath.suffix)
        if strategy is None:
            raise ValueError(f"Unsupported file extension: {elements_filepath}")
        strategy(input_file=elements_filepath, output_file=output_file, file_data=file_data)
        return output_file

    async def run_async(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Async entry point; it delegates directly to the synchronous ``run``."""
        staged = self.run(
            elements_filepath=elements_filepath,
            output_dir=output_dir,
            output_filename=output_filename,
            file_data=file_data,
            **kwargs,
        )
        return staged
@@ -0,0 +1,67 @@
1
+ from abc import ABC
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+ from typing import Any, TypeVar
5
+
6
+ from pydantic import BaseModel
7
+
8
+ from unstructured_ingest.data_types.file_data import FileData
9
+ from unstructured_ingest.interfaces import BaseConnector, BaseProcess
10
+ from unstructured_ingest.utils.data_prep import get_json_data
11
+
12
+
13
# Base configuration model for uploaders; it declares no fields of its own.
# NOTE(review): documented with comments instead of a docstring because pydantic
# may surface a model docstring as the schema description -- confirm before converting.
class UploaderConfig(BaseModel):
    pass


# Type variable bounded by UploaderConfig, used to annotate the uploader's config field.
UploaderConfigT = TypeVar("UploaderConfigT", bound=UploaderConfig)
18
+
19
+
20
@dataclass
class UploadContent:
    """Pairs a file path with its ``FileData``; ``Uploader.run_batch`` takes a list of these."""

    # Location of the file for this item.
    path: Path
    # FileData record that accompanies the file at ``path``.
    file_data: FileData
24
+
25
+
26
@dataclass
class Uploader(BaseProcess, BaseConnector, ABC):
    """Base class for destination uploaders.

    The path-based entry points (``run`` / ``run_async``) load the file with
    ``get_json_data`` and forward the loaded data to ``run_data`` /
    ``run_data_async``. ``run_data`` and ``run_batch`` raise
    ``NotImplementedError`` here.
    """

    upload_config: UploaderConfigT
    connector_type: str

    def is_async(self) -> bool:
        """Return False in this base implementation."""
        return False

    def is_batch(self) -> bool:
        """Return False in this base implementation."""
        return False

    def run_batch(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Batch entry point; not implemented in the base class."""
        raise NotImplementedError()

    def create_destination(
        self, destination_name: str = "unstructuredautocreated", **kwargs: Any
    ) -> bool:
        """Hook for creating a destination and updating the uploader config to use it.

        Returns a flag telling whether anything was created; this base version
        creates nothing and returns False.
        """
        return False

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Load the data at ``path`` and pass it to ``run_data``."""
        self.run_data(data=get_json_data(path=path), file_data=file_data, **kwargs)

    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Load the data at ``path`` and await ``run_data_async`` with it."""
        await self.run_data_async(data=get_json_data(path=path), file_data=file_data, **kwargs)

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Upload already-loaded data; not implemented in the base class."""
        raise NotImplementedError()

    async def run_data_async(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Async counterpart of ``run_data``; this base version calls ``run_data`` directly."""
        outcome = self.run_data(data=data, file_data=file_data, **kwargs)
        return outcome
60
+
61
+
62
@dataclass
class VectorDBUploader(Uploader, ABC):
    """Abstract uploader whose ``create_destination`` additionally takes a vector length."""

    def create_destination(
        self, vector_length: int, destination_name: str = "unstructuredautocreated", **kwargs: Any
    ) -> bool:
        """Return whether a destination was created; this base version always returns False.

        NOTE(review): ``vector_length`` is presumably the embedding dimension of the
        destination to create -- confirm against concrete implementations.
        """
        return False
@@ -0,0 +1,39 @@
1
+ import logging
2
+
3
+ logger = logging.getLogger("unstructured_ingest")
4
+
5
+
6
def remove_root_handlers(logger: logging.Logger) -> None:
    """Detach every handler from the root logger reachable through ``logger``.

    NOTE(robinson): in some environments such as Google Colab, there is a root
    handler that does not mask secrets, meaning sensitive info such as api keys
    appears in logs. Removing these handlers when they exist prevents that.

    Args:
        logger: any logger; only its ``root`` attribute is used.
    """
    root = logger.root
    # Iterate over a snapshot: removeHandler() mutates root.handlers, and looping
    # over the live list would skip every other handler and leave some attached.
    for handler in list(root.handlers):
        root.removeHandler(handler)
13
+
14
+
15
def ingest_log_streaming_init(level: int) -> None:
    """Attach the ingest stream handler to the module logger (once) and set its level.

    Also strips the root logger's handlers via ``remove_root_handlers``.

    Args:
        level: logging level applied to the module-level ``logger``.
    """
    handler_name = "ingest_log_handler"
    stream_handler = logging.StreamHandler()
    stream_handler.name = handler_name
    stream_handler.setFormatter(
        logging.Formatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
    )

    # Only want to add the handler once
    already_attached = any(existing.name == handler_name for existing in logger.handlers)
    if not already_attached:
        logger.addHandler(stream_handler)

    remove_root_handlers(logger)
    logger.setLevel(level)
27
+
28
+
29
def make_default_logger(level: int) -> logging.Logger:
    """Return the "unstructured_ingest" logger with the ingest stream handler attached.

    The handler is added only if a handler named "ingest_log_handler" is not
    already attached, so repeated calls do not stack handlers and emit each
    record several times. The level is applied and root handlers are removed on
    every call.

    Args:
        level: logging level to set on the returned logger.

    Returns:
        The configured "unstructured_ingest" logger.
    """
    handler_name = "ingest_log_handler"
    logger = logging.getLogger("unstructured_ingest")
    if not any(existing.name == handler_name for existing in logger.handlers):
        handler = logging.StreamHandler()
        handler.name = handler_name
        formatter = logging.Formatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    logger.setLevel(level)
    remove_root_handlers(logger)
    return logger
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env python3
2
+ from unstructured_ingest.cli.cli import get_cmd
3
+
4
+
5
def main():
    """Build the ingest command with ``get_cmd`` and invoke it."""
    get_cmd()()


# Run the CLI when this module is executed as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,128 @@
1
+ import logging
2
+ import os
3
+ from dataclasses import dataclass, field
4
+ from typing import Callable, ClassVar, Optional, Protocol, Sequence
5
+
6
+ from opentelemetry import trace
7
+ from opentelemetry.context import attach, get_current
8
+ from opentelemetry.propagate import extract, inject
9
+ from opentelemetry.sdk.resources import SERVICE_NAME, Resource
10
+ from opentelemetry.sdk.trace import ReadableSpan, Tracer, TracerProvider
11
+ from opentelemetry.sdk.trace.export import (
12
+ ConsoleSpanExporter,
13
+ SimpleSpanProcessor,
14
+ SpanExportResult,
15
+ )
16
+
17
+ from unstructured_ingest.logger import logger
18
+
19
+
20
class AddTraceCallable(Protocol):
    """Structural type for callables invoked as ``fn(provider=...)`` that return None."""

    def __call__(self, provider: TracerProvider) -> None:
        """Receive the tracer provider; the protocol body is intentionally empty."""
        ...
23
+
24
+
25
class LogSpanExporter(ConsoleSpanExporter):
    """Span exporter that passes each formatted span to a logging callable."""

    def __init__(self, log_out: Callable = logger.info, **kwargs):
        """Store ``log_out`` and forward all other keyword arguments to the parent.

        Args:
            log_out: callable receiving each formatted span; defaults to ``logger.info``.
            **kwargs: passed through unchanged to ``ConsoleSpanExporter``.
        """
        self.log_out = log_out
        super().__init__(**kwargs)

    def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
        """Emit every span through ``log_out`` and report success.

        NOTE(review): ``self.formatter`` is presumably set by the parent
        constructor from the ``formatter`` keyword -- confirm.
        """
        for finished_span in spans:
            rendered = self.formatter(finished_span)
            self.log_out(rendered)
        return SpanExportResult.SUCCESS
34
+
35
+
36
def get_log_out() -> Callable:
    """Return a callable that logs a message at the level named by ``OTEL_LOG_LEVEL``.

    The environment variable is read when this function is called, compared
    case-insensitively, and falls back to DEBUG when unset or unrecognised.
    """
    levels_by_name = {
        "CRITICAL": logging.CRITICAL,
        "FATAL": logging.FATAL,
        "ERROR": logging.ERROR,
        "WARN": logging.WARNING,
        "WARNING": logging.WARNING,
        "INFO": logging.INFO,
        "DEBUG": logging.DEBUG,
        "NOTSET": logging.NOTSET,
    }
    requested = os.getenv("OTEL_LOG_LEVEL", "DEBUG").upper()
    resolved_level = levels_by_name.get(requested, logging.DEBUG)

    def log_out(message):
        # The level is fixed at creation time; only the message varies per call.
        return logger.log(resolved_level, message)

    return log_out
50
+
51
+
52
@dataclass
class OtelHandler:
    """Sets up OpenTelemetry tracing for the ingest pipeline.

    Builds a ``TracerProvider`` with a log-based span processor and, when an
    OTLP endpoint is configured (explicitly or through the environment), an
    OTLP span processor as well.
    """

    # Explicit OTLP endpoint; when unset the OTEL_EXPORTER_OTLP_* variables are consulted.
    otel_endpoint: Optional[str] = None
    service_name: str = "unstructured-ingest"
    # NOTE(review): declared but never assigned anywhere in this class -- confirm
    # whether callers are expected to set it.
    trace_provider: TracerProvider = field(init=False)
    # Resolved per instance so OTEL_LOG_LEVEL is read at construction time rather
    # than frozen once when the class is defined at import.
    log_out: Callable = field(default_factory=get_log_out)
    trace_context_key: ClassVar[str] = "_trace_context"

    def init_trace(self):
        """Create the tracer provider and install it as the global provider."""
        # Should only be done once
        resource = Resource(attributes={SERVICE_NAME: self.service_name})
        trace_provider = self.init_trace_provider(resource=resource)
        trace.set_tracer_provider(trace_provider)

    @staticmethod
    def set_attributes(span, attributes_dict):
        """Copy every key/value of ``attributes_dict`` onto ``span``; no-op when empty."""
        if attributes_dict:
            for key, value in attributes_dict.items():
                span.set_attribute(key, value)

    @staticmethod
    def inject_context() -> dict:
        """Return a new dict populated by ``inject`` from the current context."""
        trace_context = {}
        inject(trace_context, get_current())
        return trace_context

    @staticmethod
    def attach_context(trace_context: dict) -> object:
        """Extract a context from ``trace_context``, attach it, and return ``attach``'s result."""
        return attach(extract(trace_context))

    def get_otel_endpoint(self) -> Optional[str]:
        """Return the configured endpoint, else the first non-empty env value, else None.

        Lookup order: ``self.otel_endpoint``, ``OTEL_EXPORTER_OTLP_ENDPOINT``,
        ``OTEL_EXPORTER_OTLP_TRACES_ENDPOINT``.
        """
        if self.otel_endpoint:
            return self.otel_endpoint
        for env_var in ("OTEL_EXPORTER_OTLP_ENDPOINT", "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"):
            if value := os.getenv(env_var):
                return value
        return None

    def _add_console_trace_processor(self, provider: TracerProvider) -> None:
        """Attach a processor that reports each finished span through ``self.log_out``."""

        def custom_formatter(span: ReadableSpan) -> str:
            # Timestamps are divided by 1e9 to report the duration in seconds.
            duration = (span.end_time - span.start_time) / 1e9
            s = f"{span.name} finished in {duration}s"
            if span.attributes:
                attributes_str = ", ".join([f"{k}={v}" for k, v in span.attributes.items()])
                s += f", attributes: {attributes_str}"
            return s

        tracer_exporter = LogSpanExporter(formatter=custom_formatter, log_out=self.log_out)
        processor = SimpleSpanProcessor(tracer_exporter)
        provider.add_span_processor(span_processor=processor)

    def _add_otel_trace_processor(self, provider: TracerProvider) -> None:
        """Attach an OTLP span processor when an endpoint is configured; otherwise do nothing."""
        otel_endpoint = self.get_otel_endpoint()
        if not otel_endpoint:
            return None
        # Imported lazily so the OTLP exporter package is only needed when an endpoint is set.
        from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter

        logger.debug(f"adding otel exported at {otel_endpoint}")
        # Hand the explicitly configured endpoint to the exporter; previously it was
        # dropped, so the exporter only ever saw the environment. Passing None keeps
        # the exporter's own environment-based resolution for the env-only case.
        trace_exporter = OTLPSpanExporter(endpoint=self.otel_endpoint or None)
        processor = SimpleSpanProcessor(trace_exporter)
        provider.add_span_processor(processor)

    def init_trace_provider(self, resource: Resource) -> TracerProvider:
        """Build a ``TracerProvider`` for ``resource`` with the OTLP and log processors added."""
        trace_provider = TracerProvider(resource=resource)
        add_fns: list[AddTraceCallable] = [
            self._add_otel_trace_processor,
            self._add_console_trace_processor,
        ]
        for add_fn in add_fns:
            add_fn(provider=trace_provider)
        return trace_provider

    def get_tracer(self) -> Tracer:
        """Return a tracer named after ``self.service_name`` from the global trace API."""
        return trace.get_tracer(self.service_name)
File without changes
@@ -0,0 +1,211 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import logging
5
+ import multiprocessing as mp
6
+ import shutil
7
+ from abc import ABC, abstractmethod
8
+ from concurrent.futures import ThreadPoolExecutor
9
+ from dataclasses import dataclass
10
+ from pathlib import Path
11
+ from typing import Any, Awaitable, Callable, Optional, TypeVar
12
+
13
+ from tqdm import tqdm
14
+ from tqdm.asyncio import tqdm as tqdm_asyncio
15
+
16
+ from unstructured_ingest.interfaces import BaseProcess, ProcessorConfig, Uploader
17
+ from unstructured_ingest.logger import logger, make_default_logger
18
+ from unstructured_ingest.otel import OtelHandler
19
+ from unstructured_ingest.pipeline.otel import instrument
20
+
21
# Type variable for the process wrapped by a pipeline step; constrained to BaseProcess subclasses.
BaseProcessT = TypeVar("BaseProcessT", bound=BaseProcess)
# Input shape accepted by a step: a list of keyword-argument dicts, each one
# unpacked into a single run()/run_async() call.
iterable_input = list[dict[str, Any]]
23
+
24
+
25
@dataclass
class PipelineStep(ABC):
    """Base class for one stage of the pipeline.

    A step wraps a ``process`` and decides how its inputs are fanned out:
    serially, on an asyncio event loop, or across a multiprocessing pool.
    Each invocation of the wrapped process runs inside a span named after
    ``identifier``. Failures are logged and recorded in ``context.status``,
    and re-raised only when ``context.raise_on_error`` is set.
    """

    # Process whose run()/run_async() this step invokes.
    process: BaseProcessT
    # Shared configuration; read for tqdm, num_processes, verbose, otel_endpoint,
    # async/mp support flags, raise_on_error, status, work_dir and iter_delete.
    context: ProcessorConfig
    # Name used for spans, progress bars, status entries and the cache directory.
    identifier: str

    def __str__(self):
        return self.identifier

    def process_serially(self, iterable: iterable_input) -> Any:
        """Run the step once per input dict, one after another.

        An empty ``iterable`` results in a single ``run()`` call with no arguments.

        Returns:
            A list with one result per invocation.
        """
        logger.info("processing content serially")
        if not iterable:
            return [self.run()]
        if len(iterable) == 1:
            # No progress bar for a single item.
            return [self.run(**iterable[0])]
        items = tqdm(iterable, desc=self.identifier) if self.context.tqdm else iterable
        return [self.run(**item) for item in items]

    async def _process_async(self, iterable: iterable_input) -> Any:
        """Run ``run_async`` for every input dict concurrently and gather the results."""
        if not iterable:
            return [await self.run_async()]
        if len(iterable) == 1:
            return [await self.run_async(**iterable[0])]
        if self.context.tqdm:
            return await tqdm_asyncio.gather(
                *[self.run_async(**item) for item in iterable], desc=self.identifier
            )
        return await asyncio.gather(*[self.run_async(**item) for item in iterable])

    def process_async(self, iterable: iterable_input) -> Any:
        """Synchronous entry point that drives ``_process_async`` to completion."""
        logger.info("processing content async")
        return self.asyncio_run(fn=self._process_async, iterable=iterable)

    def asyncio_run(
        self, fn: Callable[..., Awaitable[Any]], *args: Any, **kwargs: Any
    ) -> Any:
        """Run the coroutine function ``fn`` to completion from synchronous code.

        When no event loop is running in the current thread, ``asyncio.run`` is
        used directly. Otherwise the coroutine is run via ``asyncio.run`` inside
        a dedicated worker thread, because ``asyncio.run`` cannot be called from
        a thread that already has a running loop.

        Args:
            fn: Coroutine function to call with ``*args`` and ``**kwargs``.

        Returns:
            Whatever the coroutine returns.
        """
        try:
            current_loop = asyncio.get_running_loop()
        except RuntimeError:
            # Public API signals "no running loop" by raising.
            current_loop = None
        if current_loop is None:
            return asyncio.run(fn(*args, **kwargs))
        with ThreadPoolExecutor(thread_name_prefix="asyncio") as thread_pool:
            logger.warning(
                f"async code being run in dedicated thread pool "
                f"to not conflict with existing event loop: {current_loop}"
            )

            def wrapped():
                return asyncio.run(fn(*args, **kwargs))

            future = thread_pool.submit(wrapped)
            return future.result()

    def process_multiprocess(self, iterable: iterable_input) -> Any:
        """Run the step over ``iterable`` using a multiprocessing pool.

        Falls back to serial processing for a single input or when
        ``context.num_processes`` is 1. An empty ``iterable`` results in a single
        ``run()`` call with no arguments. With the progress bar enabled, results
        are collected via ``imap_unordered``; otherwise via ``map``.

        The caller's input dicts are not modified: the trace context is added to
        shallow copies that are sent to the workers.
        """
        logger.info("processing content across processes")

        if not iterable:
            return [self.run()]
        if len(iterable) == 1 or self.context.num_processes == 1:
            return self.process_serially(iterable)
        with mp.Pool(
            processes=self.context.num_processes,
            initializer=self._init_mp,
            initargs=(
                logging.DEBUG if self.context.verbose else logging.INFO,
                self.context.otel_endpoint,
            ),
        ) as pool:
            otel_context = OtelHandler.inject_context()
            # Each payload carries the trace context under the key run() pops it from.
            payloads = [
                {**item, OtelHandler.trace_context_key: otel_context} for item in iterable
            ]
            if self.context.tqdm:
                return list(
                    tqdm(
                        pool.imap_unordered(func=self._wrap_mp, iterable=payloads),
                        total=len(payloads),
                        desc=self.identifier,
                    )
                )
            return pool.map(self._wrap_mp, payloads)

    def _wrap_mp(self, input_kwargs: dict) -> Any:
        # Allow mapping of kwargs via multiprocessing map()
        return self.run(**input_kwargs)

    def _init_mp(self, log_level: int, endpoint: Optional[str] = None) -> None:
        # Init logger for each spawned process when using multiprocessing pool
        make_default_logger(level=log_level)
        otel_handler = OtelHandler(otel_endpoint=endpoint, log_out=logger.debug)
        otel_handler.init_trace()

    @instrument()
    def __call__(self, iterable: Optional[iterable_input] = None) -> Any:
        """Execute the step over ``iterable`` using the best available strategy.

        Async is chosen when the context supports it and the process is async;
        otherwise multiprocessing when the context supports it; otherwise serial.
        """
        iterable = iterable or []
        if iterable:
            logger.info(
                f"calling {self.__class__.__name__} with {len(iterable)} docs",  # type: ignore
            )
        else:
            logger.info(f"calling {self.__class__.__name__} with no inputs")
        if self.context.async_supported and self.process.is_async():
            return self.process_async(iterable=iterable)
        if self.context.mp_supported:
            return self.process_multiprocess(iterable=iterable)
        return self.process_serially(iterable=iterable)

    def _run(self, fn: Callable, **kwargs: Any) -> Optional[Any]:
        """Default synchronous body: delegate to ``run_async`` with ``fn`` as ``_fn``."""
        return self.asyncio_run(fn=self.run_async, _fn=fn, **kwargs)

    async def _run_async(self, fn: Callable, **kwargs: Any) -> Optional[Any]:
        """Asynchronous body of the step; not implemented in this base class."""
        raise NotImplementedError

    @staticmethod
    def _span_attributes(kwargs: dict[str, Any]) -> dict[str, str]:
        """Build span attributes: ``file_id`` is the stem of ``file_data_path`` when given."""
        if file_data_path := kwargs.get("file_data_path"):
            return {"file_id": Path(file_data_path).stem}
        return {}

    def _record_failure(self, error: Exception, kwargs: dict[str, Any]) -> None:
        """Log ``error`` and, when a ``file_data_path`` is known, store it in ``context.status``."""
        logger.error(f"Exception raised while running {self.identifier}", exc_info=error)
        if "file_data_path" in kwargs:
            self.context.status[kwargs["file_data_path"]] = {self.identifier: str(error)}

    def run(self, _fn: Callable[..., Any] | None = None, **kwargs: Any) -> Optional[Any]:
        """Invoke the step synchronously for one set of keyword arguments.

        Args:
            _fn: Callable handed to ``_run``; defaults to ``self.process.run``.
            **kwargs: Arguments forwarded to ``_run``. A trace-context entry, if
                present, is removed and attached before the span is opened.

        Returns:
            The result of ``_run``, or ``None`` when it raised and
            ``context.raise_on_error`` is false.

        Raises:
            Exception: Whatever ``_run`` raised, when ``context.raise_on_error`` is true.
        """
        # **kwargs is already a fresh dict, so popping from it cannot affect the caller.
        otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.debug)
        tracer = otel_handler.get_tracer()
        if trace_context := kwargs.pop(otel_handler.trace_context_key, {}):
            otel_handler.attach_context(trace_context=trace_context)
        attributes = self._span_attributes(kwargs)
        try:
            with tracer.start_as_current_span(self.identifier, record_exception=True) as span:
                otel_handler.set_attributes(span, attributes)
                fn = _fn or self.process.run
                return self._run(fn=fn, **kwargs)
        except Exception as e:
            self._record_failure(e, kwargs)
            if self.context.raise_on_error:
                raise
            return None

    async def run_async(self, _fn: Optional[Callable] = None, **kwargs: Any) -> Optional[Any]:
        """Invoke the step asynchronously for one set of keyword arguments.

        Args:
            _fn: Callable handed to ``_run_async``; defaults to ``self.process.run_async``.
            **kwargs: Arguments forwarded to ``_run_async``.

        Returns:
            The result of ``_run_async``, or ``None`` when it raised and
            ``context.raise_on_error`` is false.

        Raises:
            Exception: Whatever was raised, when ``context.raise_on_error`` is true.
        """
        otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.debug)
        try:
            attributes = self._span_attributes(kwargs)
            with otel_handler.get_tracer().start_as_current_span(
                self.identifier, record_exception=True
            ) as span:
                otel_handler.set_attributes(span, attributes)
                fn = _fn or self.process.run_async
                return await self._run_async(fn=fn, **kwargs)
        except Exception as e:
            self._record_failure(e, kwargs)
            if self.context.raise_on_error:
                raise
            return None

    @property
    def cache_dir(self) -> Path:
        """Directory for this step: ``context.work_dir`` joined with ``identifier``."""
        return Path(self.context.work_dir) / self.identifier

    def delete_cache(self):
        """Remove this step's cache directory when ``context.iter_delete`` is set and it exists."""
        if self.context.iter_delete and self.cache_dir.exists():
            cache_dir = self.cache_dir
            logger.info(f"deleting {self.identifier} cache dir {cache_dir}")
            shutil.rmtree(cache_dir)
189
+
190
+
191
@dataclass
class BatchPipelineStep(PipelineStep, ABC):
    """Pipeline step whose process may handle all inputs in one batch call.

    When the context supports multiprocessing and the process reports itself
    as batch-capable, the whole input list is handed to ``run_batch``.
    Otherwise the step behaves like a regular ``PipelineStep``.
    """

    process: Uploader

    def __call__(self, iterable: Optional[iterable_input] = None) -> Any:
        """Dispatch to batch processing when available, else to the parent strategy.

        Returns:
            The result of ``run_batch`` on the batch path, otherwise the result
            of ``PipelineStep.__call__``.
        """
        if self.context.mp_supported and self.process.is_batch():
            return self.run_batch(contents=iterable)
        # Propagate the parent's result; previously it was computed and dropped,
        # so the non-batch path always yielded None.
        return super().__call__(iterable=iterable)

    @abstractmethod
    def _run_batch(self, contents: iterable_input, **kwargs) -> Any:
        """Process all of ``contents`` in one call; implemented by subclasses."""
        pass

    def run_batch(self, contents: iterable_input, **kwargs) -> Any:
        """Run ``_run_batch`` and record any failure against the step.

        Returns:
            The result of ``_run_batch``, or ``None`` when it raised and
            ``context.raise_on_error`` is false.

        Raises:
            Exception: Whatever ``_run_batch`` raised, when ``context.raise_on_error`` is true.
        """
        try:
            return self._run_batch(contents=contents, **kwargs)
        except Exception as e:
            # Batch failures are keyed by step identifier rather than by file.
            self.context.status[self.identifier] = {"step_error": str(e)}
            if self.context.raise_on_error:
                raise
            return None
@@ -0,0 +1,32 @@
1
+ from functools import wraps
2
+ from typing import Callable, Optional
3
+
4
+ from unstructured_ingest.logger import logger
5
+ from unstructured_ingest.otel import OtelHandler
6
+
7
+
8
def instrument(
    span_name: Optional[str] = None,
    record_exception: bool = True,
    attributes: dict[str, str] = None,
    log_out: Callable = logger.info,
) -> Callable[[Callable], Callable]:
    """Build a decorator that runs a step method inside a tracing span.

    The decorated callable must be a method whose instance exposes
    ``identifier`` and ``context.otel_endpoint``.

    Args:
        span_name: Explicit span name; when empty, ``"<identifier> step"`` is used.
        record_exception: Forwarded to ``start_as_current_span``.
        attributes: Passed to ``OtelHandler.set_attributes`` for the opened span.
        log_out: Logging callable handed to the ``OtelHandler``.

    Returns:
        A decorator that wraps a method so its body executes within the span.
    """

    def span_decorator(func: Callable) -> Callable:
        @wraps(func)
        def wrap_with_span(self, *args, **kwargs):
            # Fall back to a name derived from the step when none was supplied.
            name = span_name or f"{self.identifier} step"
            handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=log_out)
            tracer = handler.get_tracer()
            with tracer.start_as_current_span(name, record_exception=record_exception) as span:
                handler.set_attributes(span, attributes)
                return func(self, *args, **kwargs)

        return wrap_with_span

    return span_decorator