unstructured-ingest 1.2.32 (py3-none-any.whl)

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of unstructured-ingest has been flagged as potentially problematic.

Files changed (243)
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
unstructured_ingest/utils/compression.py
@@ -0,0 +1,72 @@
+ import os
+ import sys
+ import tarfile
+ import zipfile
+ from pathlib import Path
+ from typing import Optional
+
+ from unstructured_ingest.logger import logger
+ from unstructured_ingest.utils.filesystem import mkdir_concurrent_safe
+
+ ZIP_FILE_EXT = [".zip"]
+ TAR_FILE_EXT = [".tar", ".tar.gz", ".tgz"]
+
+
+ def uncompress_file(filename: str, path: Optional[str] = None) -> str:
+     """
+     Takes in a compressed zip or tar file and decompresses it
+     """
+     # Create path if it doesn't already exist
+     if path:
+         mkdir_concurrent_safe(Path(path))
+
+     if any(filename.endswith(ext) for ext in ZIP_FILE_EXT):
+         return uncompress_zip_file(zip_filename=filename, path=path)
+     elif any(filename.endswith(ext) for ext in TAR_FILE_EXT):
+         return uncompress_tar_file(tar_filename=filename, path=path)
+     else:
+         raise ValueError(
+             "filename {} not a recognized compressed extension: {}".format(
+                 filename,
+                 ", ".join(ZIP_FILE_EXT + TAR_FILE_EXT),
+             ),
+         )
+
+
+ def uncompress_zip_file(zip_filename: str, path: Optional[str] = None) -> str:
+     head, tail = os.path.split(zip_filename)
+     for ext in ZIP_FILE_EXT:
+         if tail.endswith(ext):
+             tail = tail[: -(len(ext))]
+             break
+     path = path if path else os.path.join(head, f"{tail}-zip-uncompressed")
+     logger.info(f"extracting zip {zip_filename} -> {path}")
+     with zipfile.ZipFile(zip_filename) as zfile:
+         zfile.extractall(path=path)
+     return path
+
+
+ def uncompress_tar_file(tar_filename: str, path: Optional[str] = None) -> str:
+     head, tail = os.path.split(tar_filename)
+     for ext in TAR_FILE_EXT:
+         if tail.endswith(ext):
+             tail = tail[: -(len(ext))]
+             break
+
+     path = path if path else os.path.join(head, f"{tail}-tar-uncompressed")
+     logger.info(f"extracting tar {tar_filename} -> {path}")
+     # NOTE: "r:*" mode opens both compressed (e.g. ".tar.gz") and uncompressed ".tar" archives
+     with tarfile.open(tar_filename, "r:*") as tfile:
+         # NOTE(robinson): Mitigate against malicious content being extracted from the tar file.
+         # This was added in Python 3.12
+         # Ref: https://docs.python.org/3/library/tarfile.html#extraction-filters
+         if sys.version_info >= (3, 12):
+             tfile.extraction_filter = tarfile.tar_filter
+         else:
+             logger.warning(
+                 "Extraction filtering for tar files is available for Python 3.12 and above. "
+                 "Consider upgrading your Python version to improve security. "
+                 "See https://docs.python.org/3/library/tarfile.html#extraction-filters"
+             )
+         tfile.extractall(path=path)
+     return path
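
For context, a minimal usage sketch of uncompress_file follows; the archive paths and destination directory are hypothetical:

    from unstructured_ingest.utils.compression import uncompress_file

    # With no explicit path, extraction lands next to the archive,
    # e.g. /tmp/reports.zip -> /tmp/reports-zip-uncompressed
    extracted = uncompress_file(filename="/tmp/reports.zip")

    # An explicit destination is created (concurrent-safely) if missing
    extracted = uncompress_file(filename="/tmp/logs.tar.gz", path="/tmp/logs-out")
    print(extracted)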
unstructured_ingest/utils/constants.py
@@ -0,0 +1,2 @@
+ # Used to append to metadata for uploaders that store element-level data
+ RECORD_ID_LABEL = "record_id"
unstructured_ingest/utils/data_prep.py
@@ -0,0 +1,216 @@
+ import itertools
+ import json
+ from datetime import datetime
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
+ from uuid import NAMESPACE_DNS, uuid5
+
+ from unstructured_ingest.data_types.file_data import FileData
+ from unstructured_ingest.logger import logger
+ from unstructured_ingest.utils import ndjson
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+
+ if TYPE_CHECKING:
+     from pandas import DataFrame
+
+ DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
+
+ T = TypeVar("T")
+ IterableT = Iterable[T]
+
+
+ def split_dataframe(df: "DataFrame", chunk_size: int = 100) -> Generator["DataFrame", None, None]:
+     num_chunks = len(df) // chunk_size + 1
+     for i in range(num_chunks):
+         yield df[i * chunk_size : (i + 1) * chunk_size]
+
+
+ def batch_generator(iterable: IterableT, batch_size: int = 100) -> IterableT:
+     """A helper function to break an iterable into batches of size batch_size."""
+     it = iter(iterable)
+     chunk = tuple(itertools.islice(it, batch_size))
+     while chunk:
+         yield chunk
+         chunk = tuple(itertools.islice(it, batch_size))
+
+
+ def generator_batching_wbytes(
+     iterable: IterableT,
+     batch_size_limit_bytes: Optional[int] = None,
+     max_batch_size: Optional[int] = None,
+ ) -> IterableT:
+     """A helper function to break an iterable into chunks of specified bytes."""
+     if not batch_size_limit_bytes and not max_batch_size:
+         # NOTE: a bare `return iterable` inside a generator would yield nothing;
+         # with no limits configured, emit the whole iterable as a single batch.
+         yield list(iterable)
+         return
+     current_batch, current_batch_size = [], 0
+
+     for item in iterable:
+         item_size_bytes = len(json.dumps(item).encode("utf-8"))
+         if batch_size_limit_bytes and current_batch_size + item_size_bytes > batch_size_limit_bytes:
+             yield current_batch
+             current_batch, current_batch_size = [item], item_size_bytes
+             continue
+         if max_batch_size and len(current_batch) + 1 > max_batch_size:
+             yield current_batch
+             current_batch, current_batch_size = [item], item_size_bytes
+             continue
+
+         current_batch.append(item)
+         current_batch_size += item_size_bytes
+
+     if current_batch:
+         yield current_batch
+
+
+ def flatten_dict(
+     dictionary: dict[str, Any],
+     parent_key: str = "",
+     separator: str = "_",
+     flatten_lists: bool = False,
+     remove_none: bool = False,
+     keys_to_omit: Optional[Sequence[str]] = None,
+ ) -> dict[str, Any]:
+     """Flattens a nested dictionary into a single level dictionary.
+
+     keys_to_omit is a list of keys that don't get flattened. If omitting a nested key, format as
+     {parent_key}{separator}{key}. If flatten_lists is True, then lists and tuples are flattened as
+     well. If remove_none is True, then None keys/values are removed from the flattened
+     dictionary.
+     """
+     keys_to_omit = keys_to_omit if keys_to_omit else []
+     flattened_dict: dict[str, Any] = {}
+     for key, value in dictionary.items():
+         new_key = f"{parent_key}{separator}{key}" if parent_key else key
+         if new_key in keys_to_omit:
+             flattened_dict[new_key] = value
+         elif value is None and remove_none:
+             continue
+         elif isinstance(value, dict):
+             value = cast("dict[str, Any]", value)
+             flattened_dict.update(
+                 flatten_dict(
+                     value, new_key, separator, flatten_lists, remove_none, keys_to_omit=keys_to_omit
+                 ),
+             )
+         elif isinstance(value, (list, tuple)) and flatten_lists:
+             value = cast("list[Any] | tuple[Any]", value)
+             for index, item in enumerate(value):
+                 flattened_dict.update(
+                     flatten_dict(
+                         {f"{new_key}{separator}{index}": item},
+                         "",
+                         separator,
+                         flatten_lists,
+                         remove_none,
+                         keys_to_omit=keys_to_omit,
+                     )
+                 )
+         else:
+             flattened_dict[new_key] = value
+
+     return flattened_dict
+
+
+ def validate_date_args(date: Optional[str] = None) -> bool:
+     """Validate whether the provided date string satisfies any of the supported date formats.
+
+     Used by unstructured/ingest/connector/biomed.py
+
+     Returns `True` if the date string satisfies any of the supported formats, otherwise raises
+     `ValueError`.
+
+     Supported Date Formats:
+         - 'YYYY-MM-DD'
+         - 'YYYY-MM-DDTHH:MM:SS'
+         - 'YYYY-MM-DD+HH:MM:SS'
+         - 'YYYY-MM-DDTHH:MM:SS±HHMM'
+     """
+     if not date:
+         raise ValueError("The argument date is None.")
+
+     for fmt in DATE_FORMATS:
+         try:
+             datetime.strptime(date, fmt)
+             return True
+         except ValueError:
+             pass
+
+     raise ValueError(
+         f"The argument {date} does not satisfy the format:"
+         " YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or YYYY-MM-DD+HH:MM:SS or YYYY-MM-DDTHH:MM:SS±HHMM",
+     )
+
+
+ def get_data_by_suffix(path: Path) -> list[dict]:
+     with path.open() as f:
+         if path.suffix == ".json":
+             return json.load(f)
+         elif path.suffix == ".ndjson":
+             return ndjson.load(f)
+         elif path.suffix == ".csv":
+             import pandas as pd
+
+             df = pd.read_csv(path)
+             return df.to_dict(orient="records")
+         elif path.suffix == ".parquet":
+             import pandas as pd
+
+             df = pd.read_parquet(path)
+             return df.to_dict(orient="records")
+         else:
+             raise ValueError(f"Unsupported file type: {path}")
+
+
+ def write_data(path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
+     with path.open("w") as f:
+         if path.suffix == ".json":
+             json.dump(data, f, indent=indent, ensure_ascii=False)
+         elif path.suffix == ".ndjson":
+             ndjson.dump(data, f, ensure_ascii=False)
+         else:
+             raise IOError(f"Unsupported file type: {path}")
+
+
+ def get_json_data(path: Path) -> list[dict]:
+     with path.open() as f:
+         # Attempt by suffix
+         if path.suffix == ".json":
+             return json.load(f)
+         elif path.suffix == ".ndjson":
+             return ndjson.load(f)
+         try:
+             return json.load(f)
+         except Exception as e:
+             logger.warning(f"failed to read {path} as json: {e}")
+         try:
+             f.seek(0)  # rewind after the failed json attempt before retrying as ndjson
+             return ndjson.load(f)
+         except Exception as e:
+             logger.warning(f"failed to read {path} as ndjson: {e}")
+     raise ValueError(f"Unsupported json file: {path}")
+
+
+ @requires_dependencies(["pandas"])
+ def get_data_df(path: Path) -> "DataFrame":
+     import pandas as pd
+
+     with path.open() as f:
+         if path.suffix == ".json":
+             data = json.load(f)
+             return pd.DataFrame(data=data)
+         elif path.suffix == ".ndjson":
+             data = ndjson.load(f)
+             return pd.DataFrame(data=data)
+         elif path.suffix == ".csv":
+             df = pd.read_csv(path)
+             return df
+         elif path.suffix == ".parquet":
+             df = pd.read_parquet(path)
+             return df
+         else:
+             raise ValueError(f"Unsupported file type: {path}")
+
+
+ def get_enhanced_element_id(element_dict: dict, file_data: FileData) -> str:
+     element_id = element_dict.get("element_id")
+     new_data = f"{element_id}{file_data.identifier}"
+     return str(uuid5(NAMESPACE_DNS, new_data))
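
The batching helpers and flatten_dict are easiest to read with concrete values; a short sketch with illustrative data:

    from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict

    # Nested dicts collapse to separator-joined keys; with flatten_lists=True,
    # list items are keyed by index.
    nested = {"metadata": {"page": 1, "tags": ["a", "b"]}}
    print(flatten_dict(nested, flatten_lists=True))
    # {'metadata_page': 1, 'metadata_tags_0': 'a', 'metadata_tags_1': 'b'}

    # batch_generator yields tuples of at most batch_size items.
    for batch in batch_generator(range(5), batch_size=2):
        print(batch)  # (0, 1) then (2, 3) then (4,)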
unstructured_ingest/utils/dep_check.py
@@ -0,0 +1,78 @@
+ from __future__ import annotations
+
+ import asyncio
+ import importlib
+ from functools import wraps
+ from typing import (
+     Callable,
+     List,
+     Optional,
+     TypeVar,
+ )
+
+ from typing_extensions import ParamSpec
+
+ _T = TypeVar("_T")
+ _P = ParamSpec("_P")
+
+
+ def requires_dependencies(
+     dependencies: str | list[str],
+     extras: Optional[str] = None,
+ ) -> Callable[[Callable[_P, _T]], Callable[_P, _T]]:
+     """Decorator ensuring required modules are installed.
+
+     Use on functions with local imports to ensure required modules are available and log
+     an installation instruction if they're not.
+
+     Args:
+         dependencies: Name(s) of module(s) required by the decorated function.
+         extras: unstructured-ingest extra which installs required `dependencies`. Defaults to None.
+
+     Raises:
+         ImportError: When at least one of the `dependencies` is not available.
+     """
+     if isinstance(dependencies, str):
+         dependencies = [dependencies]
+
+     def decorator(func: Callable[_P, _T]) -> Callable[_P, _T]:
+         def run_check():
+             missing_deps: List[str] = []
+             for dep in dependencies:
+                 if not dependency_exists(dep):
+                     missing_deps.append(dep)
+             if len(missing_deps) > 0:
+                 raise ImportError(
+                     f"Following dependencies are missing: {', '.join(missing_deps)}. "
+                     + (
+                         f"""Please install them using `pip install "unstructured-ingest[{extras}]"`."""  # noqa: E501
+                         if extras
+                         else f"Please install them using `pip install {' '.join(missing_deps)}`."
+                     ),
+                 )
+
+         @wraps(func)
+         def wrapper(*args: _P.args, **kwargs: _P.kwargs):
+             run_check()
+             return func(*args, **kwargs)
+
+         @wraps(func)
+         async def wrapper_async(*args: _P.args, **kwargs: _P.kwargs):
+             run_check()
+             return await func(*args, **kwargs)
+
+         if asyncio.iscoroutinefunction(func):
+             return wrapper_async
+         return wrapper
+
+     return decorator
+
+
+ def dependency_exists(dependency: str):
+     try:
+         importlib.import_module(dependency)
+     except ImportError as e:
+         # Check to make sure this isn't some unrelated import error.
+         if dependency in repr(e):
+             return False
+     return True
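
A sketch of how a caller might guard an optional import with this decorator; the function name and the "pandas" extra below are illustrative, not necessarily a real extra of this package:

    from unstructured_ingest.utils.dep_check import requires_dependencies

    @requires_dependencies(["pandas"], extras="pandas")
    def load_table(path: str):
        # Safe: the decorator has already verified the module is importable
        import pandas as pd

        return pd.read_csv(path)

    # If pandas is absent, calling load_table(...) raises ImportError suggesting
    # pip install "unstructured-ingest[pandas]".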
unstructured_ingest/utils/filesystem.py
@@ -0,0 +1,27 @@
+ """
+ Filesystem utilities for concurrent operations.
+
+ This module provides race-condition-safe filesystem operations that are needed
+ when multiple processes operate on the same directory structures simultaneously.
+ """
+
+ from pathlib import Path
+
+
+ def mkdir_concurrent_safe(path: Path) -> None:
+     """
+     Create directory safely in concurrent environments, handling race conditions.
+
+     This addresses the issue where Path.mkdir(parents=True, exist_ok=True) can still
+     raise FileExistsError when multiple processes attempt to create overlapping
+     directory structures simultaneously. In this codebase, this occurs when multiple
+     files are being downloaded in parallel and archive extraction is happening in parallel.
+
+     Related: https://github.com/python/cpython/pull/112966/files
+     Python core team used the same approach to fix zipfile race conditions.
+     """
+     try:
+         path.mkdir(parents=True, exist_ok=True)
+     except FileExistsError:
+         if not (path.exists() and path.is_dir()):
+             raise
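
A minimal sketch of the race this guards against, with hypothetical paths: several workers creating overlapping directory trees at once can trip FileExistsError out of Path.mkdir even with exist_ok=True.

    from concurrent.futures import ProcessPoolExecutor
    from pathlib import Path

    from unstructured_ingest.utils.filesystem import mkdir_concurrent_safe

    target = Path("/tmp/downloads/archive/nested")

    # All four workers race to create the same parents; the helper treats a
    # losing race as success as long as the directory ends up existing.
    with ProcessPoolExecutor(max_workers=4) as pool:
        for _ in range(4):
            pool.submit(mkdir_concurrent_safe, target)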
unstructured_ingest/utils/html.py
@@ -0,0 +1,174 @@
+ import base64
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Optional
+ from urllib.parse import urlparse
+ from uuid import NAMESPACE_DNS, uuid5
+
+ from pydantic import BaseModel, Field
+
+ from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
+ from unstructured_ingest.interfaces import DownloadResponse
+ from unstructured_ingest.logger import logger
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+
+ if TYPE_CHECKING:
+     from bs4 import BeautifulSoup
+     from bs4.element import Tag
+     from requests import Session
+
+
+ class HtmlMixin(BaseModel):
+     extract_images: bool = Field(
+         default=False,
+         description="if true, will download images and replace "
+         "the html content with base64 encoded images",
+     )
+     extract_files: bool = Field(
+         default=False, description="if true, will download any embedded files"
+     )
+     force_download: bool = Field(
+         default=False,
+         description="if true, will redownload extracted files even if they already exist locally",
+     )
+     allow_list: Optional[list[str]] = Field(
+         default=None,
+         description="list of allowed urls to download, if not set, "
+         "will default to the base url the original HTML came from",
+     )
+
+     @requires_dependencies(["requests"])
+     def get_default_session(self) -> "Session":
+         import requests
+
+         return requests.Session()
+
+     def get_absolute_url(self, tag_link: str, url: str) -> str:
+         parsed_url = urlparse(url)
+         base_url = parsed_url.scheme + "://" + parsed_url.netloc
+         if tag_link.startswith("//"):
+             return f"{parsed_url.scheme}:{tag_link}"
+         elif tag_link.startswith("http"):
+             return tag_link
+         else:
+             tag_link = tag_link.lstrip("/")
+             return f"{base_url}/{tag_link}"
+
+     def download_content(self, url: str, session: "Session") -> bytes:
+         response = session.get(url)
+         response.raise_for_status()
+         return response.content
+
+     def can_download(self, url_to_download: str, original_url: str) -> bool:
+         parsed_original_url = urlparse(original_url)
+         base_url = parsed_original_url.scheme + "://" + parsed_original_url.netloc
+         allow_list = self.allow_list or [base_url]
+         for allowed_url in allow_list:
+             if url_to_download.startswith(allowed_url):
+                 return True
+         logger.info(f"Skipping url because it does not match the allow list: {url_to_download}")
+         return False
+
+     def extract_image_src(self, image: "Tag", url: str, session: "Session") -> "Tag":
+         current_src = image["src"]
+         if current_src.startswith("data:image/png;base64"):
+             # already base64 encoded
+             return image
+         absolute_url = self.get_absolute_url(tag_link=image["src"], url=url)
+         if not self.can_download(url_to_download=absolute_url, original_url=url):
+             return image
+         image_content = self.download_content(url=absolute_url, session=session)
+         logger.debug("img tag having src updated from {} to base64 content".format(image["src"]))
+         image["src"] = f"data:image/png;base64,{base64.b64encode(image_content).decode()}"
+         return image
+
+     @requires_dependencies(["bs4"])
+     def extract_html_images(self, url: str, html: str, session: Optional["Session"] = None) -> str:
+         from bs4 import BeautifulSoup
+
+         session = session or self.get_default_session()
+         soup = BeautifulSoup(html, "html.parser")
+         images = soup.find_all("img")
+         for image in images:
+             self.extract_image_src(image=image, url=url, session=session)
+         return str(soup)
+
+     @requires_dependencies(["bs4"])
+     def get_hrefs(self, url: str, html: str) -> list:
+         from bs4 import BeautifulSoup
+
+         soup = BeautifulSoup(html, "html.parser")
+         tags = self._find_hyperlink_tags(soup)
+         hrefs = [
+             tag["href"]
+             for tag in tags
+             if not tag["href"].startswith("#") and Path(tag["href"]).suffix != ""
+         ]
+         absolute_urls = [self.get_absolute_url(tag_link=href, url=url) for href in hrefs]
+         allowed_urls = [
+             url_to_download
+             for url_to_download in absolute_urls
+             if self.can_download(url_to_download=url_to_download, original_url=url)
+         ]
+         return allowed_urls
+
+     def write_content(self, content: bytes, path: Path) -> None:
+         if path.exists() and path.is_file() and not self.force_download:
+             return
+         if not path.parent.exists():
+             path.parent.mkdir(parents=True)
+         with path.open("wb") as f:
+             f.write(content)
+
+     def get_download_response(
+         self, url: str, download_dir: Path, file_data: FileData, session: "Session"
+     ) -> DownloadResponse:
+         filename = Path(urlparse(url=url).path).name
+         download_path = download_dir / filename
+         self.write_content(
+             content=self.download_content(url=url, session=session), path=download_path
+         )
+         result_file_data = file_data.model_copy(deep=True)
+         result_file_data.metadata.url = url
+         result_file_data.display_name = filename
+         if result_file_data.metadata.record_locator is None:
+             result_file_data.metadata.record_locator = {}
+         result_file_data.metadata.record_locator["parent_url"] = url
+         result_file_data.identifier = str(uuid5(NAMESPACE_DNS, url + file_data.identifier))
+         filename = Path(urlparse(url=url).path).name
+         result_file_data.source_identifiers = SourceIdentifiers(
+             filename=filename, fullpath=filename
+         )
+         result_file_data.local_download_path = download_path.as_posix()
+         return DownloadResponse(file_data=result_file_data, path=download_path)
+
+     def extract_embedded_files(
+         self,
+         url: str,
+         html: str,
+         download_dir: Path,
+         original_filedata: FileData,
+         session: Optional["Session"] = None,
+     ) -> list[DownloadResponse]:
+         session = session or self.get_default_session()
+         urls_to_download = self.get_hrefs(url=url, html=html)
+         return [
+             self.get_download_response(
+                 url=url_to_download,
+                 download_dir=download_dir,
+                 file_data=original_filedata,
+                 session=session,
+             )
+             for url_to_download in urls_to_download
+         ]
+
+     @requires_dependencies(["bs4"])
+     def _find_hyperlink_tags(self, html_soup: "BeautifulSoup") -> list["Tag"]:
+         """Find hyperlink tags in the HTML.
+
+         Overwrite this method to customize the tag search.
+         """
+         from bs4.element import Tag
+
+         return [
+             element for element in html_soup.find_all("a", href=True) if isinstance(element, Tag)
+         ]
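
The URL-resolution rules in get_absolute_url and can_download can be exercised without any network access; the URLs below are illustrative:

    from unstructured_ingest.utils.html import HtmlMixin

    mixin = HtmlMixin()
    base = "https://example.com/docs/index.html"

    # Relative, protocol-relative, and absolute links all resolve against the host
    print(mixin.get_absolute_url("assets/logo.png", base))         # https://example.com/assets/logo.png
    print(mixin.get_absolute_url("//cdn.example.com/x.js", base))  # https://cdn.example.com/x.js

    # With no allow_list configured, only URLs under the original host pass
    print(mixin.can_download("https://example.com/a.pdf", base))   # True
    print(mixin.can_download("https://other.com/a.pdf", base))     # False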
unstructured_ingest/utils/ndjson.py
@@ -0,0 +1,52 @@
+ import json
+ from typing import IO, Any
+
+
+ def dumps(obj: list[dict[str, Any]], **kwargs) -> str:
+     return "\n".join(json.dumps(each, **kwargs) for each in obj)
+
+
+ def dump(obj: list[dict[str, Any]], fp: IO, **kwargs) -> None:
+     # Indent breaks ndjson formatting
+     kwargs["indent"] = None
+     text = dumps(obj, **kwargs)
+     fp.write(text)
+
+
+ def loads(s: str, **kwargs) -> list[dict[str, Any]]:
+     return [json.loads(line, **kwargs) for line in s.splitlines()]
+
+
+ def load(fp: IO, **kwargs) -> list[dict[str, Any]]:
+     return loads(fp.read(), **kwargs)
+
+
+ class writer(object):
+     def __init__(self, f, **kwargs):
+         self.f = f
+         self.kwargs = kwargs
+
+     def write(self, row):
+         stringified = json.dumps(row, **self.kwargs)
+         self.f.write(stringified + "\n")
+
+
+ class reader(object):
+     def __init__(self, f, **kwargs):
+         self.f = f
+         self.kwargs = kwargs
+
+     def __iter__(self):
+         return self
+
+     def __next__(self):
+         line = ""
+
+         while line == "":
+             line = next(self.f).strip()
+
+         return json.loads(line, **self.kwargs)
+
+     # NOTE: this is necessary to comply with py27
+     def next(self):
+         return self.__next__()
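
A quick round trip through an in-memory buffer shows the format: one JSON object per line, with no trailing newline from dump:

    import io

    from unstructured_ingest.utils import ndjson

    records = [{"id": 1, "text": "hello"}, {"id": 2, "text": "world"}]

    buf = io.StringIO()
    ndjson.dump(records, buf)
    assert buf.getvalue() == '{"id": 1, "text": "hello"}\n{"id": 2, "text": "world"}'

    buf.seek(0)
    assert ndjson.load(buf) == records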
unstructured_ingest/utils/pydantic_models.py
@@ -0,0 +1,52 @@
+ import json
+ from datetime import datetime
+ from inspect import isclass
+ from pathlib import Path
+ from typing import Any
+
+ from pydantic import BaseModel
+ from pydantic.types import _SecretBase
+
+
+ def is_secret(value: Any) -> bool:
+     # Case Secret[int]
+     if hasattr(value, "__origin__") and hasattr(value, "__args__"):
+         origin = value.__origin__
+         return isclass(origin) and issubclass(origin, _SecretBase)
+     # Case SecretStr
+     return isclass(value) and issubclass(value, _SecretBase)
+
+
+ def serialize_base_model(model: BaseModel) -> dict:
+     # To get the full serialized dict regardless of if values are marked as Secret
+     model_dict = model.model_dump()
+     return serialize_base_dict(model_dict=model_dict)
+
+
+ def serialize_base_dict(model_dict: dict) -> dict:
+     model_dict = model_dict.copy()
+     for k, v in model_dict.items():
+         if isinstance(v, _SecretBase):
+             secret_value = v.get_secret_value()
+             if isinstance(secret_value, BaseModel):
+                 model_dict[k] = serialize_base_model(model=secret_value)
+             else:
+                 model_dict[k] = secret_value
+         if isinstance(v, dict):
+             model_dict[k] = serialize_base_dict(model_dict=v)
+
+     return model_dict
+
+
+ def serialize_base_model_json(model: BaseModel, **json_kwargs) -> str:
+     model_dict = serialize_base_model(model=model)
+
+     def json_serial(obj):
+         if isinstance(obj, Path):
+             return obj.as_posix()
+         if isinstance(obj, datetime):
+             return obj.isoformat()
+         raise TypeError("Type %s not serializable" % type(obj))
+
+     # Support json dumps kwargs such as sort_keys
+     return json.dumps(model_dict, default=json_serial, **json_kwargs)
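
A sketch of what these helpers buy over a plain model_dump; the AccessConfig model here is hypothetical:

    from pydantic import BaseModel, SecretStr

    from unstructured_ingest.utils.pydantic_models import (
        serialize_base_model,
        serialize_base_model_json,
    )

    class AccessConfig(BaseModel):  # hypothetical config model
        api_key: SecretStr

    config = AccessConfig(api_key=SecretStr("sk-123"))
    print(config.model_dump())           # {'api_key': SecretStr('**********')}
    print(serialize_base_model(config))  # {'api_key': 'sk-123'}
    print(serialize_base_model_json(config, sort_keys=True))  # {"api_key": "sk-123"}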