unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,74 @@
1
+ import json
2
+ import re
3
+ from datetime import datetime
4
+ from typing import Any, Union
5
+
6
+ from dateutil import parser
7
+
8
+ from unstructured_ingest.logger import logger
9
+
10
+
11
def json_to_dict(json_string: str) -> Union[str, dict[str, Any]]:
    """
    Best-effort JSON deserialization of ``json_string``.

    The input is parsed verbatim first; if that fails it is parsed again with
    every single quote swapped for a double quote. When both attempts fail the
    input is handed back untouched, since a path or other non-JSON text is not
    treated as an error here.

    Returns:
        Whatever ``json.loads`` produced for the first attempt that succeeded,
        otherwise the original ``json_string``.
    """
    # Each step is applied lazily, so the quote swap only runs if the verbatim
    # parse has already failed.
    normalizers = (
        lambda raw: raw,
        lambda raw: raw.replace("'", '"'),
    )
    for normalize in normalizers:
        try:
            return json.loads(normalize(json_string))
        except json.JSONDecodeError:
            # Not necessarily an error: could be a path or malformed JSON
            continue
    return json_string
25
+
26
+
27
def ensure_isoformat_datetime(timestamp: Union[datetime, str]) -> str:
    """
    Normalize ``timestamp`` to an ISO format datetime string.

    Args:
        timestamp: A ``datetime`` instance, or a string that
            ``dateutil.parser.parse`` can interpret.

    Returns:
        The result of ``isoformat()`` on the (parsed) datetime.

    Raises:
        ValueError: The string could not be parsed as a datetime.
        TypeError: The input is neither a ``datetime`` nor a ``str``.
    """
    if isinstance(timestamp, datetime):
        return timestamp.isoformat()

    if not isinstance(timestamp, str):
        raise TypeError(f"Expected input type datetime or str, but got {type(timestamp)}.")

    try:
        # dateutil accepts a wide range of textual datetime formats
        return parser.parse(timestamp).isoformat()
    except ValueError as e:
        raise ValueError(f"String '{timestamp}' could not be parsed as a datetime.") from e
43
+
44
+
45
def truncate_string_bytes(string: str, max_bytes: int, encoding: str = "utf-8") -> str:
    """
    Truncates a string to a specified maximum number of bytes.

    Args:
        string: The value to truncate; it is passed through ``str()`` before
            being encoded.
        max_bytes: Upper bound on the encoded length. Values below zero are
            treated as zero.
        encoding: Codec used to measure and cut the text.

    Returns:
        ``string`` itself when its encoded form already fits, otherwise the
        longest decodable prefix of at most ``max_bytes`` bytes. A character
        split by the cut is dropped rather than raising a decode error.
    """
    encoded_string = str(string).encode(encoding)
    if len(encoded_string) <= max_bytes:
        return string
    # A negative limit would otherwise act as a slice offset from the end and
    # keep almost the whole string instead of nothing.
    byte_limit = max(max_bytes, 0)
    return encoded_string[:byte_limit].decode(encoding, errors="ignore")
53
+
54
+
55
def fix_unescaped_unicode(text: str, encoding: str = "utf-8") -> str:
    """
    Turn literal ``\\uXXXX`` sequences embedded in ``text`` into the characters they name.

    The text is serialized to JSON, the doubled backslash in front of each
    ``uXXXX`` is collapsed to a single one, and the result is parsed back so
    the JSON decoder resolves the escapes. The repaired text must also encode
    cleanly with ``encoding``.

    Returns:
        The repaired text, or the original ``text`` if any step fails (the
        failure is logged as a warning).
    """
    try:
        serialized: str = json.dumps(text)
        # In the JSON form a literal backslash-u appears as \\uXXXX; rewriting
        # it to \uXXXX lets json.loads decode it as a real escape.
        collapsed = re.sub(r"\\\\u([0-9A-Fa-f]{4})", r"\\u\1", serialized)
        repaired = json.loads(collapsed)
        # Encoding is attempted only to surface encoding errors; its result is discarded
        repaired.encode(encoding)
    except Exception as e:
        # Best effort: fall back to the untouched input
        logger.warning(f"Failed to fix unescaped Unicode sequences: {e}", exc_info=True)
        return text
    else:
        return repaired
@@ -0,0 +1,80 @@
1
+ from typing import TYPE_CHECKING, Any
2
+
3
+ from unstructured_ingest.utils.data_prep import flatten_dict
4
+ from unstructured_ingest.utils.dep_check import requires_dependencies
5
+
6
+ if TYPE_CHECKING:
7
+ from pandas import DataFrame
8
+
9
+
10
@requires_dependencies(["pandas"])
def get_default_pandas_dtypes() -> dict[str, Any]:
    """
    Return the default pandas dtype for each known element column.

    Keys are column names; values are anything ``DataFrame.astype`` accepts
    (a ``pd.StringDtype`` instance, a dtype alias such as ``"Int64"``, or a
    Python type). pandas is imported lazily so the module can be imported
    without it.
    """
    import pandas as pd

    # Called once per textual column so every entry gets its own dtype instance
    new_string = pd.StringDtype
    nullable_int = "Int64"
    nullable_bool = "boolean"

    return {
        # core element fields
        "text": new_string(),
        "type": new_string(),
        "element_id": new_string(),
        # file level metadata (all optional strings)
        "filename": new_string(),
        "filetype": new_string(),
        "file_directory": new_string(),
        "last_modified": new_string(),
        "attached_to_filename": new_string(),
        "parent_id": new_string(),
        "category_depth": nullable_int,  # optional int
        "image_path": new_string(),
        "languages": object,  # optional list of strings
        "page_number": nullable_int,  # optional int
        "page_name": new_string(),
        # links
        "url": new_string(),
        "link_urls": new_string(),
        "link_texts": object,  # optional list of strings
        "links": object,
        # email style metadata
        "sent_from": object,  # optional list of strings
        "sent_to": object,  # optional list of strings
        "subject": new_string(),
        "section": new_string(),
        "header_footer_type": new_string(),
        "emphasized_text_contents": object,  # optional list of strings
        "emphasized_text_tags": object,  # optional list of strings
        "text_as_html": new_string(),
        "regex_metadata": object,
        "max_characters": nullable_int,  # optional int
        "is_continuation": nullable_bool,  # optional bool
        "detection_class_prob": float,  # optional float
        "sender": new_string(),
        # coordinates
        "coordinates_points": object,
        "coordinates_system": new_string(),
        "coordinates_layout_width": float,
        "coordinates_layout_height": float,
        # data source
        "data_source_url": new_string(),
        "data_source_version": new_string(),
        "data_source_record_locator": object,
        "data_source_date_created": new_string(),
        "data_source_date_modified": new_string(),
        "data_source_date_processed": new_string(),
        "data_source_permissions_data": object,
        "embeddings": object,
        "regex_metadata_key": object,
    }
60
+
61
+
62
def convert_to_pandas_dataframe(
    elements_dict: list[dict[str, Any]],
    drop_empty_cols: bool = False,
) -> "DataFrame":
    """
    Build a typed pandas DataFrame from a list of element dictionaries.

    Args:
        elements_dict: One dictionary per element. Each dictionary is modified
            in place: its ``"metadata"`` entry is removed and, when non-empty,
            replaced by the flattened metadata keys.
        drop_empty_cols: When true, columns whose values are all missing are
            dropped from the result.

    Returns:
        A DataFrame whose known columns are cast to the dtypes from
        ``get_default_pandas_dtypes``; other columns keep their inferred dtype.
    """
    import pandas as pd

    # Flatten metadata if it hasn't already been flattened
    for element in elements_dict:
        nested = element.pop("metadata", None)
        if not nested:
            continue
        element.update(flatten_dict(nested, keys_to_omit=["data_source_record_locator"]))

    frame = pd.DataFrame.from_dict(elements_dict)

    # Only cast columns that are actually present in this frame
    defaults = get_default_pandas_dtypes()
    casts = {column: defaults[column] for column in defaults if column in frame.columns}
    frame = frame.astype(casts)

    if drop_empty_cols:
        frame = frame.dropna(axis=1, how="all")
    return frame
@@ -0,0 +1,15 @@
1
+ import os
2
+ import ssl
3
+
4
+ import certifi
5
+
6
+
7
def ssl_context_with_optional_ca_override():
    """
    Build a default SSL context, honouring ``REQUESTS_CA_BUNDLE`` when it is set.

    Follows the httpx guidance for SSL_CERT_FILE / SSL_CERT_DIR
    (https://www.python-httpx.org/advanced/ssl/#working-with-ssl_cert_file-and-ssl_cert_dir)
    but reads REQUESTS_CA_BUNDLE instead, because that variable works with
    many other Python packages.

    The variable is interpreted as follows:
      * unset or empty -> the certifi bundle is used as the CA file
      * a directory    -> passed as ``capath``
      * anything else  -> passed as ``cafile``

    Returns:
        An ``ssl.SSLContext`` created by ``ssl.create_default_context``.
    """
    ca_override = os.environ.get("REQUESTS_CA_BUNDLE")
    if not ca_override:
        return ssl.create_default_context(cafile=certifi.where())
    # The same value used to be passed as both cafile and capath; a directory
    # is only valid as capath and a bundle file only as cafile, so pick one.
    if os.path.isdir(ca_override):
        return ssl.create_default_context(capath=ca_override)
    return ssl.create_default_context(cafile=ca_override)
@@ -0,0 +1,235 @@
1
+ Metadata-Version: 2.4
2
+ Name: unstructured_ingest
3
+ Version: 1.2.32
4
+ Summary: Local ETL data pipeline to get data RAG ready
5
+ Author-email: Unstructured Technologies <devops@unstructuredai.io>
6
+ License-Expression: Apache-2.0
7
+ License-File: LICENSE.md
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Intended Audience :: Education
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Requires-Python: <3.13,>=3.10
20
+ Requires-Dist: certifi>=2025.7.14
21
+ Requires-Dist: click
22
+ Requires-Dist: opentelemetry-sdk
23
+ Requires-Dist: pydantic>=2.7
24
+ Requires-Dist: python-dateutil
25
+ Requires-Dist: tqdm
26
+ Provides-Extra: airtable
27
+ Requires-Dist: pandas; extra == 'airtable'
28
+ Requires-Dist: pyairtable; extra == 'airtable'
29
+ Provides-Extra: astradb
30
+ Requires-Dist: astrapy>2.0.0; extra == 'astradb'
31
+ Provides-Extra: azure
32
+ Requires-Dist: adlfs; extra == 'azure'
33
+ Requires-Dist: fsspec; extra == 'azure'
34
+ Provides-Extra: azure-ai-search
35
+ Requires-Dist: azure-search-documents; extra == 'azure-ai-search'
36
+ Provides-Extra: bedrock
37
+ Requires-Dist: aioboto3; extra == 'bedrock'
38
+ Requires-Dist: aiobotocore[boto3]!=2.24.2; extra == 'bedrock'
39
+ Requires-Dist: boto3; extra == 'bedrock'
40
+ Provides-Extra: biomed
41
+ Requires-Dist: bs4; extra == 'biomed'
42
+ Requires-Dist: requests; extra == 'biomed'
43
+ Provides-Extra: box
44
+ Requires-Dist: boxfs; extra == 'box'
45
+ Requires-Dist: fsspec; extra == 'box'
46
+ Provides-Extra: chroma
47
+ Requires-Dist: chromadb; extra == 'chroma'
48
+ Provides-Extra: clarifai
49
+ Requires-Dist: clarifai; extra == 'clarifai'
50
+ Provides-Extra: confluence
51
+ Requires-Dist: atlassian-python-api; extra == 'confluence'
52
+ Requires-Dist: requests; extra == 'confluence'
53
+ Provides-Extra: couchbase
54
+ Requires-Dist: couchbase; extra == 'couchbase'
55
+ Provides-Extra: databricks-delta-tables
56
+ Requires-Dist: databricks-sql-connector; extra == 'databricks-delta-tables'
57
+ Requires-Dist: pandas; extra == 'databricks-delta-tables'
58
+ Provides-Extra: databricks-volumes
59
+ Requires-Dist: databricks-sdk>=0.70.0; extra == 'databricks-volumes'
60
+ Provides-Extra: delta-table
61
+ Requires-Dist: boto3; extra == 'delta-table'
62
+ Requires-Dist: deltalake; extra == 'delta-table'
63
+ Requires-Dist: pandas; extra == 'delta-table'
64
+ Requires-Dist: pyarrow; extra == 'delta-table'
65
+ Requires-Dist: tenacity; extra == 'delta-table'
66
+ Provides-Extra: discord
67
+ Requires-Dist: discord-py; extra == 'discord'
68
+ Provides-Extra: doc
69
+ Requires-Dist: unstructured[doc]; extra == 'doc'
70
+ Provides-Extra: docx
71
+ Requires-Dist: unstructured[docx]; extra == 'docx'
72
+ Provides-Extra: dropbox
73
+ Requires-Dist: dropboxdrivefs; extra == 'dropbox'
74
+ Requires-Dist: fsspec; extra == 'dropbox'
75
+ Provides-Extra: duckdb
76
+ Requires-Dist: duckdb; extra == 'duckdb'
77
+ Requires-Dist: pandas; extra == 'duckdb'
78
+ Provides-Extra: elasticsearch
79
+ Requires-Dist: elasticsearch[async]<9.0.0; extra == 'elasticsearch'
80
+ Provides-Extra: epub
81
+ Requires-Dist: unstructured[epub]; extra == 'epub'
82
+ Provides-Extra: gcs
83
+ Requires-Dist: bs4; extra == 'gcs'
84
+ Requires-Dist: fsspec; extra == 'gcs'
85
+ Requires-Dist: gcsfs; extra == 'gcs'
86
+ Provides-Extra: github
87
+ Requires-Dist: pygithub>1.58.0; extra == 'github'
88
+ Requires-Dist: requests; extra == 'github'
89
+ Provides-Extra: gitlab
90
+ Requires-Dist: python-gitlab; extra == 'gitlab'
91
+ Provides-Extra: google-drive
92
+ Requires-Dist: google-api-python-client; extra == 'google-drive'
93
+ Requires-Dist: tenacity; extra == 'google-drive'
94
+ Provides-Extra: hubspot
95
+ Requires-Dist: hubspot-api-client; extra == 'hubspot'
96
+ Requires-Dist: urllib3; extra == 'hubspot'
97
+ Provides-Extra: huggingface
98
+ Requires-Dist: sentence-transformers; extra == 'huggingface'
99
+ Provides-Extra: ibm-watsonx-s3
100
+ Requires-Dist: httpx; extra == 'ibm-watsonx-s3'
101
+ Requires-Dist: pandas; extra == 'ibm-watsonx-s3'
102
+ Requires-Dist: pyarrow; extra == 'ibm-watsonx-s3'
103
+ Requires-Dist: pyiceberg; extra == 'ibm-watsonx-s3'
104
+ Requires-Dist: tenacity; extra == 'ibm-watsonx-s3'
105
+ Provides-Extra: image
106
+ Requires-Dist: unstructured[image]; extra == 'image'
107
+ Provides-Extra: jira
108
+ Requires-Dist: atlassian-python-api; extra == 'jira'
109
+ Provides-Extra: kafka
110
+ Requires-Dist: confluent-kafka; extra == 'kafka'
111
+ Provides-Extra: kdbai
112
+ Requires-Dist: kdbai-client>=1.4.0; extra == 'kdbai'
113
+ Requires-Dist: pandas; extra == 'kdbai'
114
+ Provides-Extra: lancedb
115
+ Requires-Dist: lancedb; extra == 'lancedb'
116
+ Provides-Extra: md
117
+ Requires-Dist: unstructured[md]; extra == 'md'
118
+ Provides-Extra: milvus
119
+ Requires-Dist: pymilvus; extra == 'milvus'
120
+ Provides-Extra: mixedbreadai
121
+ Requires-Dist: mixedbread; extra == 'mixedbreadai'
122
+ Provides-Extra: mongodb
123
+ Requires-Dist: pymongo; extra == 'mongodb'
124
+ Provides-Extra: msg
125
+ Requires-Dist: unstructured[msg]; extra == 'msg'
126
+ Provides-Extra: neo4j
127
+ Requires-Dist: cymple; extra == 'neo4j'
128
+ Requires-Dist: neo4j-rust-ext; extra == 'neo4j'
129
+ Requires-Dist: networkx; extra == 'neo4j'
130
+ Provides-Extra: notion
131
+ Requires-Dist: backoff; extra == 'notion'
132
+ Requires-Dist: htmlbuilder; extra == 'notion'
133
+ Requires-Dist: httpx; extra == 'notion'
134
+ Requires-Dist: notion-client; extra == 'notion'
135
+ Provides-Extra: octoai
136
+ Requires-Dist: openai; extra == 'octoai'
137
+ Requires-Dist: tiktoken; extra == 'octoai'
138
+ Provides-Extra: odt
139
+ Requires-Dist: unstructured[odt]; extra == 'odt'
140
+ Provides-Extra: onedrive
141
+ Requires-Dist: msal; extra == 'onedrive'
142
+ Requires-Dist: office365-rest-python-client; extra == 'onedrive'
143
+ Requires-Dist: requests; extra == 'onedrive'
144
+ Provides-Extra: openai
145
+ Requires-Dist: openai; extra == 'openai'
146
+ Requires-Dist: tiktoken; extra == 'openai'
147
+ Provides-Extra: opensearch
148
+ Requires-Dist: boto3>=1.26.0; extra == 'opensearch'
149
+ Requires-Dist: botocore>=1.29.0; extra == 'opensearch'
150
+ Requires-Dist: opensearch-py<3.0.0,>=2.4.0; extra == 'opensearch'
151
+ Provides-Extra: org
152
+ Requires-Dist: unstructured[org]; extra == 'org'
153
+ Provides-Extra: outlook
154
+ Requires-Dist: msal; extra == 'outlook'
155
+ Requires-Dist: office365-rest-python-client; extra == 'outlook'
156
+ Provides-Extra: pdf
157
+ Requires-Dist: unstructured[pdf]; extra == 'pdf'
158
+ Provides-Extra: pinecone
159
+ Requires-Dist: pinecone; extra == 'pinecone'
160
+ Provides-Extra: postgres
161
+ Requires-Dist: pandas; extra == 'postgres'
162
+ Requires-Dist: psycopg2-binary; extra == 'postgres'
163
+ Provides-Extra: ppt
164
+ Requires-Dist: unstructured[ppt]; extra == 'ppt'
165
+ Provides-Extra: pptx
166
+ Requires-Dist: unstructured[pptx]; extra == 'pptx'
167
+ Provides-Extra: qdrant
168
+ Requires-Dist: qdrant-client; extra == 'qdrant'
169
+ Provides-Extra: reddit
170
+ Requires-Dist: praw; extra == 'reddit'
171
+ Provides-Extra: redis
172
+ Requires-Dist: redis<=5.3.0; extra == 'redis'
173
+ Provides-Extra: remote
174
+ Requires-Dist: unstructured-client>=0.30.0; extra == 'remote'
175
+ Provides-Extra: rst
176
+ Requires-Dist: unstructured[rst]; extra == 'rst'
177
+ Provides-Extra: rtf
178
+ Requires-Dist: unstructured[rtf]; extra == 'rtf'
179
+ Provides-Extra: s3
180
+ Requires-Dist: fsspec; extra == 's3'
181
+ Requires-Dist: s3fs; extra == 's3'
182
+ Provides-Extra: salesforce
183
+ Requires-Dist: simple-salesforce; extra == 'salesforce'
184
+ Provides-Extra: sftp
185
+ Requires-Dist: fsspec; extra == 'sftp'
186
+ Requires-Dist: paramiko; extra == 'sftp'
187
+ Provides-Extra: sharepoint
188
+ Requires-Dist: msal; extra == 'sharepoint'
189
+ Requires-Dist: office365-rest-python-client; extra == 'sharepoint'
190
+ Requires-Dist: requests; extra == 'sharepoint'
191
+ Provides-Extra: singlestore
192
+ Requires-Dist: pandas; extra == 'singlestore'
193
+ Requires-Dist: singlestoredb; extra == 'singlestore'
194
+ Provides-Extra: slack
195
+ Requires-Dist: slack-sdk[optional]; extra == 'slack'
196
+ Provides-Extra: snowflake
197
+ Requires-Dist: pandas; extra == 'snowflake'
198
+ Requires-Dist: psycopg2-binary; extra == 'snowflake'
199
+ Requires-Dist: snowflake-connector-python; extra == 'snowflake'
200
+ Provides-Extra: teradata
201
+ Requires-Dist: pandas; extra == 'teradata'
202
+ Requires-Dist: teradatasql; extra == 'teradata'
203
+ Provides-Extra: togetherai
204
+ Requires-Dist: together; extra == 'togetherai'
205
+ Provides-Extra: tsv
206
+ Requires-Dist: unstructured[tsv]; extra == 'tsv'
207
+ Provides-Extra: vastdb
208
+ Requires-Dist: ibis; extra == 'vastdb'
209
+ Requires-Dist: pandas; extra == 'vastdb'
210
+ Requires-Dist: pyarrow; extra == 'vastdb'
211
+ Requires-Dist: vastdb; extra == 'vastdb'
212
+ Provides-Extra: vectara
213
+ Requires-Dist: aiofiles; extra == 'vectara'
214
+ Requires-Dist: httpx; extra == 'vectara'
215
+ Requires-Dist: requests; extra == 'vectara'
216
+ Provides-Extra: vertexai
217
+ Requires-Dist: vertexai; extra == 'vertexai'
218
+ Provides-Extra: voyageai
219
+ Requires-Dist: langchain-core<1.0.0,>=0.3.81; extra == 'voyageai'
220
+ Requires-Dist: voyageai; extra == 'voyageai'
221
+ Provides-Extra: weaviate
222
+ Requires-Dist: weaviate-client; extra == 'weaviate'
223
+ Provides-Extra: wikipedia
224
+ Requires-Dist: wikipedia; extra == 'wikipedia'
225
+ Provides-Extra: xlsx
226
+ Requires-Dist: unstructured[xlsx]; extra == 'xlsx'
227
+ Provides-Extra: zendesk
228
+ Requires-Dist: aiofiles; extra == 'zendesk'
229
+ Requires-Dist: bs4; extra == 'zendesk'
230
+ Requires-Dist: httpx; extra == 'zendesk'
231
+ Description-Content-Type: text/markdown
232
+
233
+ # Unstructured Ingest
234
+
235
+ For details, see the [Unstructured Ingest overview](https://docs.unstructured.io/ingestion/overview) in the Unstructured documentation.