unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,522 @@
1
+ from collections import abc
2
+ from contextlib import contextmanager
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+ from time import time
6
+ from typing import TYPE_CHECKING, Any, Callable, Generator, List, Optional, Union
7
+
8
+ from pydantic import BaseModel, Field, Secret
9
+
10
+ from unstructured_ingest.data_types.file_data import (
11
+ FileData,
12
+ FileDataSourceMetadata,
13
+ SourceIdentifiers,
14
+ )
15
+ from unstructured_ingest.error import SourceConnectionError, ValueError
16
+ from unstructured_ingest.interfaces import (
17
+ AccessConfig,
18
+ ConnectionConfig,
19
+ Downloader,
20
+ DownloaderConfig,
21
+ DownloadResponse,
22
+ Indexer,
23
+ IndexerConfig,
24
+ download_responses,
25
+ )
26
+ from unstructured_ingest.logger import logger
27
+ from unstructured_ingest.processes.connector_registry import (
28
+ SourceRegistryEntry,
29
+ )
30
+ from unstructured_ingest.utils.dep_check import requires_dependencies
31
+
32
+ if TYPE_CHECKING:
33
+ from atlassian import Jira
34
+
35
# Connector type name; used as the default ``connector_type`` of the Jira indexer and downloader.
CONNECTOR_TYPE = "jira"

# Default column / row separators for the downloader's per-issue text formatting helpers.
DEFAULT_C_SEP = " " * 5
DEFAULT_R_SEP = "\n"
39
+
40
+
41
class JiraIssueMetadata(BaseModel):
    """Minimal description of a Jira issue: its id, its key, and optional raw fields.

    ``fields`` holds the raw field mapping of the issue when one was supplied;
    ``get_attachments`` reads the ``attachment`` entry from it.
    """

    id: str
    key: str
    fields: Optional[dict] = None  # raw field mapping; carries the attachment data

    def get_project_id(self) -> str:
        """Return the part of the issue key that precedes the first ``-``."""
        project_part, _, _ = self.key.partition("-")
        return project_part

    def get_attachments(self) -> List[dict]:
        """Return the ``attachment`` entry of ``fields``, or ``[]`` when there is none."""
        if not self.fields or "attachment" not in self.fields:
            return []
        return self.fields["attachment"]
54
+
55
+
56
class FieldGetter(dict):
    """A ``dict`` whose subscript lookup never fails.

    Subscripting with a missing key, or a key whose stored value is ``None``,
    returns a new empty ``FieldGetter``, so chained lookups such as
    ``fields["a"]["b"]["c"]`` can be written without guarding each level.
    """

    def __getitem__(self, key):
        # dict.get bypasses this override, so it yields the stored value or None.
        stored = dict.get(self, key)
        return FieldGetter({}) if stored is None else stored
62
+
63
+
64
def nested_object_to_field_getter(obj: dict) -> Union[FieldGetter, dict]:
    """Recursively wrap a mapping, and every mapping nested directly in its values, as FieldGetter.

    Anything that is not a mapping is returned unchanged. Values that are not
    mappings (including lists that may contain mappings) are copied over as-is.
    """
    if not isinstance(obj, abc.Mapping):
        return obj
    return FieldGetter(
        {
            name: (
                FieldGetter(nested_object_to_field_getter(child))
                if isinstance(child, abc.Mapping)
                else child
            )
            for name, child in obj.items()
        }
    )
75
+
76
+
77
def api_token_based_generator(
    fn: Callable, key: str = "issues", **kwargs
) -> Generator[dict, None, None]:
    """Yield every item of a token-paginated API, one page at a time.

    ``fn`` is called with ``nextPageToken=<token>`` plus the remaining keyword
    arguments. The items found under ``key`` in each response are yielded, and
    paging continues for as long as the response carries a truthy
    ``nextPageToken``. An initial token may be passed as ``nextPageToken``.
    """
    page_token = kwargs.pop("nextPageToken", None)
    more_pages = True
    while more_pages:
        page = fn(nextPageToken=page_token, **kwargs)
        yield from page.get(key, [])
        page_token = page.get("nextPageToken")
        more_pages = bool(page_token)
89
+
90
+
91
def api_page_based_generator(
    fn: Callable, key: str = "issues", **kwargs
) -> Generator[dict, None, None]:
    """Yield every item of an offset-paginated API, one page at a time.

    ``fn`` is called with ``start=<offset>`` plus the remaining keyword
    arguments. The offset begins at the ``start`` keyword (default 0) and
    advances by the number of items received; paging stops at the first
    response whose ``key`` entry is empty or missing.
    """
    offset = kwargs.pop("start", 0)
    while batch := fn(start=offset, **kwargs).get(key, []):
        yield from batch
        offset += len(batch)
103
+
104
+
105
class JiraAccessConfig(AccessConfig):
    """Secret credentials for Jira: a password / Cloud API token, or a personal access token."""

    password: Optional[str] = Field(
        default=None,
        description="Jira password or Cloud API token",
    )
    token: Optional[str] = Field(
        default=None,
        description="Jira Personal Access Token",
    )
114
+
115
+
116
class JiraConnectionConfig(ConnectionConfig):
    """Connection settings for a Jira instance, plus a factory for its API client.

    After model initialisation the credentials are validated: either basic auth
    (username together with a password) or a personal access token must be
    supplied, never both, and cloud mode requires basic auth.
    """

    url: str = Field(description="URL of the Jira instance")
    username: Optional[str] = Field(
        default=None,
        description="Username or email for authentication",
    )
    cloud: bool = Field(default=False, description="Authenticate to Jira Cloud")
    access_config: Secret[JiraAccessConfig] = Field(description="Access configuration for Jira")

    def model_post_init(self, __context):
        """Reject incomplete or conflicting credentials.

        Raises:
            ValueError: when ``cloud`` is set without username and password,
                when basic-auth credentials and a token are both present, or
                when no credentials are present at all.
        """
        secrets = self.access_config.get_secret_value()
        has_basic_auth = bool(self.username and secrets.password)
        has_pat_auth = bool(secrets.token)

        # The cloud check runs first so its more specific message takes priority.
        if self.cloud and not has_basic_auth:
            raise ValueError(
                "cloud authentication requires username and API token (--password), "
                "see: https://atlassian-python-api.readthedocs.io/"
            )
        if has_basic_auth and has_pat_auth:
            raise ValueError(
                "both password and token provided, only one allowed, "
                "see: https://atlassian-python-api.readthedocs.io/"
            )
        if not has_basic_auth and not has_pat_auth:
            raise ValueError(
                "no form of auth provided, see: https://atlassian-python-api.readthedocs.io/"
            )

    @requires_dependencies(["atlassian"], extras="jira")
    @contextmanager
    def get_client(self) -> Generator["Jira", None, None]:
        """Yield an ``atlassian.Jira`` client built from this configuration.

        The client is entered as a context manager and stays open until the
        caller's ``with`` block finishes.
        """
        from atlassian import Jira

        secrets = self.access_config.get_secret_value()
        client_options = {
            "url": self.url,
            "username": self.username,
            "password": secrets.password,
            "token": secrets.token,
            "cloud": self.cloud,
        }
        with Jira(**client_options) as client:
            yield client
158
+
159
+
160
class JiraIndexerConfig(IndexerConfig):
    """Selects which Jira issues are indexed; at least one selector must be given."""

    projects: Optional[list[str]] = Field(None, description="List of project keys")
    boards: Optional[list[str]] = Field(None, description="List of board IDs")
    issues: Optional[list[str]] = Field(None, description="List of issue keys or IDs")
    status_filters: Optional[list[str]] = Field(
        default=None,
        description="List of status filters, if provided will only return issues that have these statuses",  # noqa: E501
    )

    def model_post_init(self, context: Any, /) -> None:
        """Raise ``ValueError`` when none of projects, boards, or issues is populated."""
        selectors = (self.projects, self.boards, self.issues)
        if not any(selectors):
            raise ValueError("At least one of projects, boards, or issues must be provided.")
172
+
173
+
174
@dataclass
class JiraIndexer(Indexer):
    """Enumerate Jira issues by project, board, or explicit key and emit one FileData per issue."""

    connection_config: JiraConnectionConfig
    index_config: JiraIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def precheck(self) -> None:
        """Verify that the configured user can connect and has the BROWSE_PROJECTS permission.

        Raises:
            SourceConnectionError: if the permission request fails for any reason.
            ValueError: if the request succeeds but the permission is not granted.
        """
        try:
            with self.connection_config.get_client() as client:
                response = client.get_permissions("BROWSE_PROJECTS")
                permitted = response["permissions"]["BROWSE_PROJECTS"]["havePermission"]
        except Exception as e:
            logger.error(f"Failed to connect to Jira: {e}", exc_info=True)
            # Chain explicitly so the original failure is kept as the cause.
            raise SourceConnectionError(f"Failed to connect to Jira: {e}") from e
        if not permitted:
            # NOTE(review): this block was recovered from a rendering that strips leading
            # whitespace, so any indentation inside this multi-line literal could not be
            # confirmed - verify the continuation lines against the packaged file.
            raise ValueError(
                """The provided user is not permitted to browse projects
from the given Jira organization URL.
Try checking username, password, token and the url arguments.""",
            )
        logger.info("Connection to Jira successful.")

    def run_jql(self, jql: str, **kwargs) -> Generator[JiraIssueMetadata, None, None]:
        """Run a JQL query and yield each matching issue as ``JiraIssueMetadata``.

        Token-based paging (``enhanced_jql``) is used when the client reports
        ``cloud``; offset-based paging (``jql``) is used otherwise. Extra keyword
        arguments are forwarded to the client call.
        """
        with self.connection_config.get_client() as client:
            if client.cloud:
                for issue in api_token_based_generator(client.enhanced_jql, jql=jql, **kwargs):
                    yield JiraIssueMetadata.model_validate(issue)
            else:
                for issue in api_page_based_generator(client.jql, jql=jql, **kwargs):
                    yield JiraIssueMetadata.model_validate(issue)

    def _get_issues_within_projects(self) -> Generator[JiraIssueMetadata, None, None]:
        """Yield the issues belonging to the configured projects."""
        fields = ["key", "id", "status", "attachment"]  # attachment data is needed downstream
        jql = "project in ({})".format(", ".join(self.index_config.projects))
        jql = self._update_jql(jql)
        logger.debug(f"running jql: {jql}")
        return self.run_jql(jql=jql, fields=fields)

    def _get_issues_within_single_board(
        self, board_id: str
    ) -> Generator[JiraIssueMetadata, None, None]:
        """Yield the issues of one board, restricted to the status filters when configured."""
        with self.connection_config.get_client() as client:
            fields = ["key", "id", "attachment"]  # attachment data is needed downstream
            if self.index_config.status_filters:
                jql = "status in ({}) ORDER BY id".format(
                    ", ".join([f'"{s}"' for s in self.index_config.status_filters])
                )
            else:
                jql = "ORDER BY id"
            logger.debug(f"running jql for board {board_id}: {jql}")
            for issue in api_page_based_generator(
                fn=client.get_issues_for_board, board_id=board_id, fields=fields, jql=jql
            ):
                yield JiraIssueMetadata.model_validate(issue)

    def _get_issues_within_boards(self) -> Generator[JiraIssueMetadata, None, None]:
        """Yield the issues of every configured board; yields nothing when no boards are set."""
        # A bare `yield` here used to emit None and then fall through to iterate
        # `boards`, raising TypeError when it was None. Stop cleanly instead.
        if not self.index_config.boards:
            return
        for board_id in self.index_config.boards:
            yield from self._get_issues_within_single_board(board_id=board_id)

    def _update_jql(self, jql: str) -> str:
        """Append the optional status filter and a fixed ``ORDER BY id`` to ``jql``."""
        if self.index_config.status_filters:
            jql += " and status in ({})".format(
                ", ".join([f'"{s}"' for s in self.index_config.status_filters])
            )
        jql = jql + " ORDER BY id"
        return jql

    def _get_issues_by_keys(self) -> Generator[JiraIssueMetadata, None, None]:
        """Yield the issues named explicitly in the configuration."""
        fields = ["key", "id", "attachment"]  # attachment data is needed downstream
        jql = "key in ({})".format(", ".join(self.index_config.issues))
        jql = self._update_jql(jql)
        logger.debug(f"running jql: {jql}")
        return self.run_jql(jql=jql, fields=fields)

    def _create_file_data_from_issue(self, issue: JiraIssueMetadata) -> FileData:
        """Build the ``FileData`` record for one issue.

        The issue is addressed as ``<project>/<key>.txt``. A summary of each
        attachment (id, filename, created, mimeType) is stored in the record
        locator when the issue has attachments.
        """
        # Construct relative path and filename first
        filename = f"{issue.key}.txt"
        relative_path = str(Path(issue.get_project_id()) / filename)

        # Build metadata with attachments included in record_locator
        record_locator = {"id": issue.id, "key": issue.key, "full_path": relative_path}

        # Add attachments to record_locator if they exist
        attachments = issue.get_attachments()
        if attachments:
            record_locator["attachments"] = [
                {
                    "id": att["id"],
                    "filename": att["filename"],
                    "created": att.get("created"),
                    "mimeType": att.get("mimeType"),
                }
                for att in attachments
            ]

        metadata = FileDataSourceMetadata(
            date_processed=str(time()),
            record_locator=record_locator,
        )

        source_identifiers = SourceIdentifiers(
            filename=filename,
            fullpath=relative_path,
            rel_path=relative_path,
        )

        file_data = FileData(
            identifier=issue.id,
            connector_type=self.connector_type,
            metadata=metadata,
            additional_metadata=issue.model_dump(),
            source_identifiers=source_identifiers,
            display_name=source_identifiers.fullpath,
        )
        return file_data

    def get_generators(self) -> List[Callable]:
        """Return the issue generators that apply to the current configuration.

        Order is boards, then explicit issues, then projects; ``run`` keeps the
        first occurrence of each issue key across them.
        """
        generators = []
        if self.index_config.boards:
            generators.append(self._get_issues_within_boards)
        if self.index_config.issues:
            generators.append(self._get_issues_by_keys)
        if self.index_config.projects:
            generators.append(self._get_issues_within_projects)
        return generators

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield one ``FileData`` per distinct issue key across all configured selectors."""
        # A set keeps the duplicate check O(1) per issue instead of scanning a list.
        seen_keys = set()
        for gen in self.get_generators():
            for issue in gen():
                if not issue:
                    continue
                if issue.key in seen_keys:
                    continue
                seen_keys.add(issue.key)
                yield self._create_file_data_from_issue(issue=issue)
313
+
314
+
315
class JiraDownloaderConfig(DownloaderConfig):
    """Downloader options for Jira; controls whether issue attachments are fetched too."""

    download_attachments: bool = Field(
        description="If True, will download any attachments and process as well",
        default=False,
    )
319
+
320
+
321
+ @dataclass
322
+ class JiraDownloader(Downloader):
323
+ connection_config: JiraConnectionConfig
324
+ download_config: JiraDownloaderConfig = field(default_factory=JiraDownloaderConfig)
325
+ connector_type: str = CONNECTOR_TYPE
326
+
327
def _get_id_fields_for_issue(
    self, issue: dict, c_sep: str = DEFAULT_C_SEP, r_sep: str = DEFAULT_R_SEP
) -> str:
    """Format the issue's ``id`` and ``key`` as one ``IssueID_IssueKey:`` line."""
    return f"IssueID_IssueKey:{issue['id']}{c_sep}{issue['key']}{r_sep}"
332
+
333
def _get_project_fields_for_issue(
    self, issue: dict, c_sep: str = DEFAULT_C_SEP, r_sep: str = DEFAULT_R_SEP
) -> str:
    """Format the project's ``key`` and ``name``, or return ``""`` when there is no project."""
    if "project" not in issue:
        return ""
    project = issue["project"]
    return f"ProjectID_Key:{project['key']}{c_sep}{project['name']}{r_sep}"
342
+
343
def _get_dropdown_fields_for_issue(
    self, issue: dict, c_sep: str = DEFAULT_C_SEP, r_sep: str = DEFAULT_R_SEP
) -> str:
    """Render the issue's categorical fields as labelled lines.

    Reads issue type name, status name, priority (interpolated as-is),
    assignee account id and display name, reporter e-mail and display name,
    labels, and component names. ``c_sep`` joins values on one line and
    ``r_sep`` is emitted between entries.
    """
    # NOTE(review): this block was recovered from a rendering that strips leading
    # whitespace, so the indentation inside the literal below could not be confirmed.
    # It is runtime output - verify against the packaged file before applying.
    return f"""
IssueType:{issue["issuetype"]["name"]}
{r_sep}
Status:{issue["status"]["name"]}
{r_sep}
Priority:{issue["priority"]}
{r_sep}
AssigneeID_Name:{issue["assignee"]["accountId"]}{c_sep}{issue["assignee"]["displayName"]}
{r_sep}
ReporterAdr_Name:{issue["reporter"]["emailAddress"]}{c_sep}{issue["reporter"]["displayName"]}
{r_sep}
Labels:{c_sep.join(issue["labels"])}
{r_sep}
Components:{c_sep.join([component["name"] for component in issue["components"]])}
{r_sep}
"""
362
+
363
+ def _get_subtasks_for_issue(self, issue: dict) -> str:
364
+ return ""
365
+
366
def _get_text_fields_for_issue(
    self, issue: dict, c_sep: str = DEFAULT_C_SEP, r_sep: str = DEFAULT_R_SEP
) -> str:
    """Render the issue's free-text content.

    Emits the summary, the description, and the ``self`` entry of every item in
    ``issue["attachment"]`` (joined with ``c_sep``), with ``r_sep`` between them.
    """
    # NOTE(review): this block was recovered from a rendering that strips leading
    # whitespace, so the indentation inside the literal below could not be confirmed.
    # It is runtime output - verify against the packaged file before applying.
    return f"""
{issue["summary"]}
{r_sep}
{issue["description"]}
{r_sep}
{c_sep.join([attachment["self"] for attachment in issue["attachment"]])}
{r_sep}
"""
377
+
378
def _get_comments_for_issue(
    self, issue: dict, c_sep: str = DEFAULT_C_SEP, r_sep: str = DEFAULT_R_SEP
) -> str:
    """Format every comment under ``issue["comment"]["comments"]`` and join them with ``c_sep``."""
    rendered_comments = []
    for comment in issue["comment"]["comments"]:
        rendered_comments.append(self._get_fields_for_comment(comment))
    return c_sep.join(rendered_comments)
384
+
385
def _get_fields_for_comment(
    self, comment, c_sep: str = DEFAULT_C_SEP, r_sep: str = DEFAULT_R_SEP
) -> str:
    """Format one comment as ``<author display name><c_sep><body><r_sep>``."""
    author_name = comment["author"]["displayName"]
    body = comment["body"]
    return f"{author_name}{c_sep}{body}{r_sep}"
389
+
390
def form_templated_string(
    self,
    issue: dict,
    parsed_fields: Union[FieldGetter, dict],
    c_sep: str = "|||",
    r_sep: str = "\n\n\n",
) -> str:
    """Assemble the text document for one issue from its API response.

    The id section is taken from ``issue``; every other section (project,
    dropdown fields, subtasks, comments, text fields) is taken from
    ``parsed_fields``. Sections are joined with ``r_sep``. The resulting string
    is what gets saved to disk and later processed by partition.
    """
    sections = [
        self._get_id_fields_for_issue(issue),
        self._get_project_fields_for_issue(parsed_fields),
        self._get_dropdown_fields_for_issue(parsed_fields),
        self._get_subtasks_for_issue(parsed_fields),
        self._get_comments_for_issue(parsed_fields),
        self._get_text_fields_for_issue(parsed_fields),
    ]
    return r_sep.join(sections)
409
+
410
def update_file_data(self, file_data: FileData, issue: dict) -> None:
    """Copy timestamps and the project name from a fetched issue onto ``file_data``.

    Sets ``metadata.date_created`` and ``metadata.date_modified`` from the
    issue's ``created`` / ``updated`` fields and ``display_name`` from the
    project's name. ``file_data`` is modified in place.
    """
    issue_fields = issue["fields"]
    metadata = file_data.metadata
    metadata.date_created = issue_fields["created"]
    metadata.date_modified = issue_fields["updated"]
    file_data.display_name = issue_fields["project"]["name"]
414
+
415
def get_issue(self, issue_key: str) -> dict:
    """Fetch a single issue by key.

    Args:
        issue_key: Key of the issue to fetch.

    Returns:
        The issue object returned by the client.

    Raises:
        SourceConnectionError: If opening the client or fetching the issue
            fails for any reason. The original exception is attached as the
            cause.
    """
    try:
        with self.connection_config.get_client() as client:
            return client.issue(key=issue_key)
    except Exception as e:
        message = f"Failed to fetch issue with key: {issue_key}: {e}"
        logger.error(message, exc_info=True)
        # Chain explicitly so the traceback marks the client failure as the
        # direct cause of the connector error.
        raise SourceConnectionError(message) from e
422
+
423
def generate_attachment_file_data(
    self, attachment_dict: dict, parent_filedata: FileData
) -> FileData:
    """Build the FileData describing one attachment of an issue.

    A deep copy of ``parent_filedata`` is taken and then overwritten with the
    attachment's identifier, metadata, record locator and source identifiers;
    all writes go to the copy.

    Args:
        attachment_dict: Attachment description; must provide ``id`` and
            ``filename``, may provide ``created``, ``mimeType``, ``size`` and
            ``self``.
        parent_filedata: FileData of the issue that owns the attachment.

    Returns:
        The FileData for the attachment.
    """
    attachment_filedata = parent_filedata.model_copy(deep=True)

    attachment_id = attachment_dict["id"]
    original_name = attachment_dict["filename"]
    parent_locator = parent_filedata.metadata.record_locator

    # Locator for the attachment, carrying the parent's locator for context.
    locator = {
        "id": attachment_id,
        "filename": original_name,
        "created": attachment_dict.get("created"),
        "mimeType": attachment_dict.get("mimeType"),
        "parent": {
            "id": parent_locator["id"],
            "key": parent_locator["key"],
            "full_path": parent_filedata.source_identifiers.fullpath,
        },
    }

    # The trailing "a" keeps attachment identifiers from colliding with issue ids.
    attachment_filedata.identifier = f"{attachment_id}a"
    stored_name = f"{original_name}.{attachment_id}"

    attachment_filedata.metadata.filesize_bytes = attachment_dict.get("size")
    attachment_filedata.metadata.date_created = attachment_dict.get("created")
    attachment_filedata.metadata.url = attachment_dict.get("self")
    attachment_filedata.metadata.record_locator = locator

    # Place the attachment under the parent's path (suffix removed) so it is
    # saved in the same folder as the parent issue.
    parent_dir = Path(parent_filedata.source_identifiers.fullpath).with_suffix("")
    attachment_path = (parent_dir / Path(stored_name)).as_posix()
    attachment_filedata.metadata.record_locator["full_path"] = attachment_path
    attachment_filedata.source_identifiers = SourceIdentifiers(
        filename=stored_name,
        fullpath=attachment_path,
        rel_path=attachment_path,
    )
    return attachment_filedata
460
+
461
def process_attachments(
    self, file_data: FileData, attachments: list[dict]
) -> list[DownloadResponse]:
    """Download every attachment of an issue and describe each download.

    Attachments are written to an ``attachments`` directory next to the
    issue's own download path. Each file is named after the attachment id and
    keeps the suffix of the attachment's original filename.

    Args:
        file_data: FileData of the parent issue.
        attachments: Attachment descriptions, each providing ``id`` and ``filename``.

    Returns:
        One download response per attachment, in input order.
    """
    responses = []
    with self.connection_config.get_client() as client:
        issue_download_path = self.get_download_path(file_data)
        attachments_dir = issue_download_path.parent / "attachments"
        attachments_dir.mkdir(parents=True, exist_ok=True)
        for attachment in attachments:
            original_suffix = Path(attachment["filename"]).suffix
            attachment_id = attachment["id"]
            target_path = attachments_dir / Path(attachment_id).with_suffix(original_suffix)
            content = client.get_attachment_content(attachment_id=attachment_id)
            with open(target_path, "wb") as out_file:
                out_file.write(content)
            attachment_filedata = self.generate_attachment_file_data(
                attachment_dict=attachment, parent_filedata=file_data
            )
            response = self.generate_download_response(
                file_data=attachment_filedata, download_path=target_path
            )
            responses.append(response)
    return responses
487
+
488
def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
    """Download one issue as a text file, optionally with its attachments.

    The issue named by ``file_data.additional_metadata["key"]`` is fetched,
    rendered with ``form_templated_string`` and written to the download path.
    ``file_data`` is then updated from the issue.

    Args:
        file_data: FileData of the issue to download.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        A single download response for the issue. When
        ``download_config.download_attachments`` is set and the issue has
        attachments, a list with the issue's response followed by one
        response per attachment.

    Raises:
        ValueError: If the issue key is missing from the metadata or no
            download path can be derived from ``file_data``.
    """
    issue_key = file_data.additional_metadata.get("key")
    if not issue_key:
        raise ValueError("Issue key not found in metadata.")
    issue = self.get_issue(issue_key)
    parsed_fields = nested_object_to_field_getter(issue["fields"])
    issue_str = self.form_templated_string(issue, parsed_fields)

    download_path = self.get_download_path(file_data)
    if download_path is None:
        raise ValueError("File data is missing source identifiers data.")
    download_path.parent.mkdir(parents=True, exist_ok=True)
    # Issue text can contain arbitrary Unicode. Writing UTF-8 explicitly avoids
    # an encoding error on platforms whose locale default cannot represent it.
    with open(download_path, "w", encoding="utf-8") as f:
        f.write(issue_str)
    self.update_file_data(file_data, issue)
    download_response = self.generate_download_response(
        file_data=file_data, download_path=download_path
    )
    if self.download_config.download_attachments and (
        attachments := issue.get("fields", {}).get("attachment")
    ):
        attachment_responses = self.process_attachments(
            file_data=file_data, attachments=attachments
        )
        download_response = [download_response] + attachment_responses
    return download_response
514
+
515
+
516
# Source registry entry bundling the Jira connection, indexer and downloader
# classes with their configuration classes.
jira_source_entry = SourceRegistryEntry(
    connection_config=JiraConnectionConfig,
    indexer=JiraIndexer,
    indexer_config=JiraIndexerConfig,
    downloader=JiraDownloader,
    downloader_config=JiraDownloaderConfig,
)
@@ -0,0 +1,17 @@
1
+ from __future__ import annotations
2
+
3
+ from unstructured_ingest.processes.connector_registry import (
4
+ add_destination_entry,
5
+ add_source_entry,
6
+ )
7
+
8
+ from .cloud import CONNECTOR_TYPE as CLOUD_CONNECTOR
9
+ from .cloud import kafka_cloud_destination_entry, kafka_cloud_source_entry
10
+ from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR
11
+ from .local import kafka_local_destination_entry, kafka_local_source_entry
12
+
13
# Register the local Kafka connector as a source and as a destination.
add_source_entry(entry=kafka_local_source_entry, source_type=LOCAL_CONNECTOR)
add_destination_entry(entry=kafka_local_destination_entry, destination_type=LOCAL_CONNECTOR)

# Register the cloud Kafka connector as a source and as a destination.
add_source_entry(entry=kafka_cloud_source_entry, source_type=CLOUD_CONNECTOR)
add_destination_entry(entry=kafka_cloud_destination_entry, destination_type=CLOUD_CONNECTOR)
@@ -0,0 +1,121 @@
1
+ import socket
2
+ from dataclasses import dataclass
3
+ from typing import TYPE_CHECKING
4
+
5
+ from pydantic import Field, Secret, SecretStr
6
+
7
+ from unstructured_ingest.logger import logger
8
+ from unstructured_ingest.processes.connector_registry import (
9
+ DestinationRegistryEntry,
10
+ SourceRegistryEntry,
11
+ )
12
+ from unstructured_ingest.processes.connectors.kafka.kafka import (
13
+ KafkaAccessConfig,
14
+ KafkaConnectionConfig,
15
+ KafkaDownloader,
16
+ KafkaDownloaderConfig,
17
+ KafkaIndexer,
18
+ KafkaIndexerConfig,
19
+ KafkaUploader,
20
+ KafkaUploaderConfig,
21
+ )
22
+
23
if TYPE_CHECKING:
    # No type-checking-only imports are declared; the guard body is empty.
    pass

# Identifier of this connector; used as the default ``connector_type`` of the
# indexer, downloader and uploader classes defined in this module.
CONNECTOR_TYPE = "kafka-cloud"
27
+
28
+
29
class CloudKafkaAccessConfig(KafkaAccessConfig):
    # Credentials for the cloud Kafka connection. CloudKafkaConnectionConfig
    # sends kafka_api_key as the SASL username and secret as the SASL password.
    kafka_api_key: SecretStr = Field(
        description="Kafka API key to connect at the server", default=None
    )
    # The description was previously empty, leaving generated help text blank.
    secret: SecretStr = Field(
        description="Kafka API secret, used as the SASL password together with kafka_api_key",
        default=None,
    )
34
+
35
+
36
class CloudKafkaConnectionConfig(KafkaConnectionConfig):
    # Connection settings for a Kafka cluster reached over SASL_SSL using the
    # PLAIN mechanism, with credentials taken from CloudKafkaAccessConfig.
    access_config: Secret[CloudKafkaAccessConfig]

    def _authenticated_client_settings(self) -> dict:
        """Return the settings shared by the consumer and producer configurations.

        Contains the SASL credentials read from ``access_config``, the SASL
        mechanism, the security protocol and the logger.
        """
        access_config = self.access_config.get_secret_value()
        return {
            "sasl.username": access_config.kafka_api_key.get_secret_value(),
            "sasl.password": access_config.secret.get_secret_value(),
            "sasl.mechanism": "PLAIN",
            "security.protocol": "SASL_SSL",
            "logger": logger,
        }

    def get_consumer_configuration(self) -> dict:
        """Return the configuration dictionary for a Kafka consumer.

        Combines the bootstrap address, a client id taken from the local host
        name, the consumer group id, the auto-commit and offset-reset settings,
        and the shared authentication settings.
        """
        return {
            "bootstrap.servers": f"{self.bootstrap_server}:{self.port}",
            "client.id": socket.gethostname(),
            "group.id": self.group_id,
            "enable.auto.commit": "false",
            "auto.offset.reset": "earliest",
            **self._authenticated_client_settings(),
        }

    def get_producer_configuration(self) -> dict:
        """Return the configuration dictionary for a Kafka producer.

        Combines the bootstrap address with the shared authentication settings.
        """
        return {
            "bootstrap.servers": f"{self.bootstrap_server}:{self.port}",
            **self._authenticated_client_settings(),
        }
74
+
75
+
76
class CloudKafkaIndexerConfig(KafkaIndexerConfig):
    # Adds nothing to KafkaIndexerConfig; gives the cloud connector its own
    # indexer configuration type.
    pass
78
+
79
+
80
@dataclass
class CloudKafkaIndexer(KafkaIndexer):
    """KafkaIndexer bound to the cloud connection and indexer configuration types."""

    # Narrows the inherited fields to the cloud-specific configuration classes.
    connection_config: CloudKafkaConnectionConfig
    index_config: CloudKafkaIndexerConfig
    # Defaults to this module's connector identifier.
    connector_type: str = CONNECTOR_TYPE
85
+
86
+
87
class CloudKafkaDownloaderConfig(KafkaDownloaderConfig):
    # Adds nothing to KafkaDownloaderConfig; gives the cloud connector its own
    # downloader configuration type.
    pass
89
+
90
+
91
@dataclass
class CloudKafkaDownloader(KafkaDownloader):
    """KafkaDownloader bound to the cloud connection and downloader configuration types."""

    # Narrows the inherited fields to the cloud-specific configuration classes.
    connection_config: CloudKafkaConnectionConfig
    download_config: CloudKafkaDownloaderConfig
    # Defaults to this module's connector identifier.
    connector_type: str = CONNECTOR_TYPE
96
+
97
+
98
class CloudKafkaUploaderConfig(KafkaUploaderConfig):
    # Adds nothing to KafkaUploaderConfig; gives the cloud connector its own
    # uploader configuration type.
    pass
100
+
101
+
102
@dataclass
class CloudKafkaUploader(KafkaUploader):
    """KafkaUploader bound to the cloud connection and uploader configuration types."""

    # Narrows the inherited fields to the cloud-specific configuration classes.
    connection_config: CloudKafkaConnectionConfig
    upload_config: CloudKafkaUploaderConfig
    # Defaults to this module's connector identifier.
    connector_type: str = CONNECTOR_TYPE
107
+
108
+
109
# Source registry entry: cloud connection plus the indexer and downloader
# classes with their configuration classes.
kafka_cloud_source_entry = SourceRegistryEntry(
    connection_config=CloudKafkaConnectionConfig,
    indexer_config=CloudKafkaIndexerConfig,
    indexer=CloudKafkaIndexer,
    downloader_config=CloudKafkaDownloaderConfig,
    downloader=CloudKafkaDownloader,
)

# Destination registry entry: cloud connection plus the uploader class and
# its configuration class.
kafka_cloud_destination_entry = DestinationRegistryEntry(
    connection_config=CloudKafkaConnectionConfig,
    uploader_config=CloudKafkaUploaderConfig,
    uploader=CloudKafkaUploader,
)
121
+ )