unstructured-ingest 0.5.9__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest has been flagged as potentially problematic.
- test/integration/connectors/test_dropbox.py +151 -0
- test/integration/connectors/test_jira.py +67 -0
- test/unit/test_utils.py +27 -0
- test/unit/v2/connectors/test_jira.py +401 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/openai.py +4 -3
- unstructured_ingest/utils/string_and_date_utils.py +25 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +4 -0
- unstructured_ingest/v2/processes/connectors/confluence.py +2 -2
- unstructured_ingest/v2/processes/connectors/delta_table.py +2 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +78 -15
- unstructured_ingest/v2/processes/connectors/jira.py +453 -0
- unstructured_ingest/v2/processes/partitioner.py +2 -5
- unstructured_ingest/v2/unstructured_api.py +7 -0
- {unstructured_ingest-0.5.9.dist-info → unstructured_ingest-0.5.10.dist-info}/METADATA +16 -16
- {unstructured_ingest-0.5.9.dist-info → unstructured_ingest-0.5.10.dist-info}/RECORD +20 -16
- {unstructured_ingest-0.5.9.dist-info → unstructured_ingest-0.5.10.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.9.dist-info → unstructured_ingest-0.5.10.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.9.dist-info → unstructured_ingest-0.5.10.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.9.dist-info → unstructured_ingest-0.5.10.dist-info}/top_level.txt +0 -0
unstructured_ingest/utils/string_and_date_utils.py

@@ -1,9 +1,12 @@
 import json
+import re
 import typing as t
 from datetime import datetime

 from dateutil import parser

+from unstructured_ingest.v2.logger import logger
+

 def json_to_dict(json_string: str) -> t.Union[str, t.Dict[str, t.Any]]:
     """Helper function attempts to deserialize json string to a dictionary."""
@@ -47,3 +50,25 @@ def truncate_string_bytes(string: str, max_bytes: int, encoding: str = "utf-8")
     if len(encoded_string) <= max_bytes:
         return string
     return encoded_string[:max_bytes].decode(encoding, errors="ignore")
+
+
+def fix_unescaped_unicode(text: str, encoding: str = "utf-8") -> str:
+    """
+    Fix unescaped Unicode sequences in text.
+    """
+    try:
+        _text: str = json.dumps(text)
+
+        # Pattern to match unescaped Unicode sequences like \\uXXXX
+        pattern = r"\\\\u([0-9A-Fa-f]{4})"
+        # Replace with properly escaped Unicode sequences \uXXXX
+        _text = re.sub(pattern, r"\\u\1", _text)
+        _text = json.loads(_text)
+
+        # Encode the text to check for encoding errors
+        _text.encode(encoding)
+        return _text
+    except Exception as e:
+        # Return original text if encoding fails
+        logger.warning(f"Failed to fix unescaped Unicode sequences: {e}", exc_info=True)
+        return text
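Note (illustrative, not part of the diff): the new helper round-trips its input through `json.dumps`/`json.loads`, rewriting double-escaped `\\uXXXX` sequences into single escapes so the load step decodes them into real characters. A minimal usage sketch with hypothetical sample strings:

from unstructured_ingest.utils.string_and_date_utils import fix_unescaped_unicode

print(fix_unescaped_unicode("Caf\\u00e9"))  # Café - the literal \u00e9 escape is decoded
print(fix_unescaped_unicode("plain text"))  # plain text - strings without escapes pass through unchanged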
unstructured_ingest/v2/processes/connectors/__init__.py

@@ -34,6 +34,8 @@ from .gitlab import CONNECTOR_TYPE as GITLAB_CONNECTOR_TYPE
 from .gitlab import gitlab_source_entry
 from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
 from .google_drive import google_drive_source_entry
+from .jira import CONNECTOR_TYPE as JIRA_CONNECTOR_TYPE
+from .jira import jira_source_entry
 from .kdbai import CONNECTOR_TYPE as KDBAI_CONNECTOR_TYPE
 from .kdbai import kdbai_destination_entry
 from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE
@@ -115,3 +117,5 @@ add_source_entry(source_type=CONFLUENCE_CONNECTOR_TYPE, entry=confluence_source_

 add_source_entry(source_type=DISCORD_CONNECTOR_TYPE, entry=discord_source_entry)
 add_destination_entry(destination_type=REDIS_CONNECTOR_TYPE, entry=redis_destination_entry)
+
+add_source_entry(source_type=JIRA_CONNECTOR_TYPE, entry=jira_source_entry)
unstructured_ingest/v2/processes/connectors/confluence.py

@@ -8,6 +8,7 @@ from pydantic import Field, Secret
 from unstructured_ingest.error import SourceConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.html import HtmlMixin
+from unstructured_ingest.utils.string_and_date_utils import fix_unescaped_unicode
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -224,7 +225,6 @@ class ConfluenceDownloader(Downloader):
                 page_id=doc_id,
                 expand="history.lastUpdated,version,body.view",
             )
-
         except Exception as e:
             logger.error(f"Failed to retrieve page with ID {doc_id}: {e}", exc_info=True)
             raise SourceConnectionError(f"Failed to retrieve page with ID {doc_id}: {e}")
@@ -236,7 +236,7 @@ class ConfluenceDownloader(Downloader):
         title = page["title"]
         # Using h1 for title is supported by both v1 and v2 html parsing in unstructured
         title_html = f"<h1>{title}</h1>"
-        content = f"<body class='Document' >{title_html}{content}</body>"
+        content = fix_unescaped_unicode(f"<body class='Document' >{title_html}{content}</body>")
         if self.download_config.extract_images:
             with self.connection_config.get_client() as client:
                 content = self.download_config.extract_html_images(
unstructured_ingest/v2/processes/connectors/delta_table.py

@@ -92,6 +92,7 @@ class DeltaTableUploadStager(UploadStager):
         output_path = Path(output_dir) / Path(f"{output_filename}.parquet")

         df = convert_to_pandas_dataframe(elements_dict=elements_contents)
+        df = df.dropna(axis=1, how="all")
         df.to_parquet(output_path)

         return output_path
@@ -153,6 +154,7 @@ class DeltaTableUploader(Uploader):
             "table_or_uri": updated_upload_path,
             "data": df,
             "mode": "overwrite",
+            "schema_mode": "merge",
             "storage_options": storage_options,
         }
         queue = Queue()
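Note (illustrative, not part of the diff): the stager change drops columns whose values are all null before writing parquet, and the uploader's `"schema_mode": "merge"` asks the deltalake writer to merge a differing schema into the existing table instead of failing on a mismatch. A minimal pandas sketch of the column drop, with hypothetical data:

import pandas as pd

# Hypothetical element table where the "languages" column was never populated.
df = pd.DataFrame({"text": ["a", "b"], "languages": [None, None]})
df = df.dropna(axis=1, how="all")  # removes the all-null column
print(list(df.columns))  # ['text']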
unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py

@@ -1,9 +1,8 @@
 from __future__ import annotations

-from contextlib import contextmanager
 from dataclasses import dataclass, field
 from time import time
-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, Any, Optional

 from pydantic import Field, Secret

@@ -34,7 +33,7 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
 )

 if TYPE_CHECKING:
-
+    pass

 CONNECTOR_TYPE = "dropbox"

@@ -46,32 +45,95 @@ class DropboxIndexerConfig(FsspecIndexerConfig):


 class DropboxAccessConfig(FsspecAccessConfig):
-    token: Optional[str] = Field(
+    token: Optional[str] = Field(
+        default=None, description="Dropbox access token."
+    )  # This is the short lived (4h) token that needs to be generated anew each time.
+    app_key: Optional[str] = Field(default=None, description="Dropbox app key.")
+    app_secret: Optional[str] = Field(default=None, description="Dropbox app secret.")
+    refresh_token: Optional[str] = Field(
+        default=None, description="Dropbox refresh token."
+    )  # This is the long lived token that doesn't expire


 class DropboxConnectionConfig(FsspecConnectionConfig):
-    supported_protocols: list[str] = field(default_factory=lambda: ["dropbox"], init=False)
     access_config: Secret[DropboxAccessConfig] = Field(
         default=DropboxAccessConfig(), validate_default=True
     )
-    connector_type: str = Field(default=CONNECTOR_TYPE
-
-    @requires_dependencies(["
-
-
-
-
+    connector_type: str = Field(default=CONNECTOR_TYPE)
+
+    @requires_dependencies(["dropbox"])
+    def get_dropbox_access_token_from_refresh(
+        self,
+        refresh_token: str,
+        app_key: str,
+        app_secret: str,
+    ) -> str:
+        """
+        Uses the Dropbox Python SDK to exchange a long-lived refresh token for an access token.
+        """
+        import dropbox
+
+        dbx = dropbox.Dropbox(
+            oauth2_access_token=None,
+            oauth2_refresh_token=refresh_token,
+            app_key=app_key,
+            app_secret=app_secret,
+        )

+        # This call fetches a new short-lived token and auto-updates dbx._oauth2_access_token
+        dbx.check_and_refresh_access_token()
+        short_lived_token = dbx._oauth2_access_token  # Private attr, but standard usage
+        return short_lived_token
+
+    def get_access_config(self) -> dict[str, Any]:
+        """
+        Overrides the parent FsspecConnectionConfig.get_access_config() to ensure
+        that we always provide an access token if refresh credentials exist.
+        """
+        base_conf = super().get_access_config()
+
+        refresh_token = base_conf.get("refresh_token")
+        app_key = base_conf.get("app_key")
+        app_secret = base_conf.get("app_secret")
+
+        # Standard scenario - we have a refresh token and creds provided
+        # which we're going to use to retrieve an access token
+        if refresh_token and app_key and app_secret:
+            logger.debug("Attempting to generate access token from refresh token...")
+            new_token = self.get_dropbox_access_token_from_refresh(
+                refresh_token=refresh_token,
+                app_key=app_key,
+                app_secret=app_secret,
+            )
+            if not new_token:
+                raise ValueError(
+                    "Unable to retrieve an access token from Dropbox. "
+                    "Please check that your refresh token, app key, and secret are valid."
+                )
+            base_conf["token"] = new_token
+        elif not base_conf.get("token"):  # we might already have an access token from outside
+            # We have neither an existing short-lived token nor refresh credentials
+            raise ValueError(
+                "No valid token or refresh_token with app credentials was found. "
+                "Please check that your refresh token, app key, and secret are valid "
+                "or provide a valid short-lived token"
+            )
+
+        return base_conf
+
+    @requires_dependencies(["dropbox"])
     def wrap_error(self, e: Exception) -> Exception:
         from dropbox.exceptions import AuthError, HttpError, RateLimitError

         if not isinstance(e, HttpError):
-            logger.error(f"
+            logger.error(f"Unhandled Dropbox exception: {repr(e)}", exc_info=True)
             return e
+
         if isinstance(e, AuthError):
             raise UserAuthError(e.error)
-
+        elif isinstance(e, RateLimitError):
             return CustomRateLimitError(e.error)
+
         status_code = e.status_code
         if 400 <= status_code < 500:
             if body := getattr(e, "body", None):
@@ -83,7 +145,8 @@ class DropboxConnectionConfig(FsspecConnectionConfig):
                 return ProviderError(body)
             else:
                 return ProviderError(e.body)
-
+
+        logger.error(f"Unhandled Dropbox HttpError: {repr(e)}", exc_info=True)
         return e

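Note (illustrative, not part of the diff): a minimal sketch of the new refresh-token flow, using placeholder credentials. When `refresh_token`, `app_key`, and `app_secret` are all set, `get_access_config()` exchanges them for a short-lived access token via the Dropbox SDK before the credentials reach fsspec:

from unstructured_ingest.v2.processes.connectors.fsspec.dropbox import (
    DropboxAccessConfig,
    DropboxConnectionConfig,
)

config = DropboxConnectionConfig(
    access_config=DropboxAccessConfig(
        refresh_token="<long-lived refresh token>",  # placeholder values
        app_key="<app key>",
        app_secret="<app secret>",
    )
)
access = config.get_access_config()
assert access["token"]  # short-lived token obtained via check_and_refresh_access_token()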
unstructured_ingest/v2/processes/connectors/jira.py (new file)

@@ -0,0 +1,453 @@
+import math
+from collections import abc
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Union
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import SourceConnectionError
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    SourceRegistryEntry,
+)
+
+if TYPE_CHECKING:
+    from atlassian import Jira
+
+CONNECTOR_TYPE = "jira"
+
+DEFAULT_C_SEP = " " * 5
+DEFAULT_R_SEP = "\n"
+
+
+@dataclass
+class JiraIssueMetadata:
+    id: str
+    key: str
+    board_id: Optional[str] = None
+
+    @property
+    def project_id(self) -> str:
+        return self.key.split("-")[0]
+
+    def to_dict(self) -> Dict[str, Union[str, None]]:
+        return {
+            "id": self.id,
+            "key": self.key,
+            "board_id": self.board_id,
+            "project_id": self.project_id,
+        }
+
+
+class FieldGetter(dict):
+    def __getitem__(self, key):
+        value = super().__getitem__(key) if key in self else None
+        if value is None:
+            value = FieldGetter({})
+        return value
+
+
+def nested_object_to_field_getter(obj: dict) -> Union[FieldGetter, dict]:
+    if isinstance(obj, abc.Mapping):
+        new_object = {}
+        for k, v in obj.items():
+            if isinstance(v, abc.Mapping):
+                new_object[k] = FieldGetter(nested_object_to_field_getter(v))
+            else:
+                new_object[k] = v
+        return FieldGetter(new_object)
+    else:
+        return obj
+
+
+def issues_fetcher_wrapper(func, results_key="results", number_of_issues_to_fetch: int = 100):
+    """
+    A decorator function that wraps around a function to fetch issues from Jira API in a paginated
+    manner. This is required because the Jira API has a limit of 100 issues per request.
+
+    Args:
+        func (callable): The function to be wrapped. This function should accept `limit` and `start`
+            as keyword arguments.
+        results_key (str, optional): The key in the response dictionary that contains the list of
+            results. Defaults to "results".
+        number_of_issues_to_fetch (int, optional): The total number of issues to fetch. Defaults to
+            100.
+
+    Returns:
+        list: A list of all fetched issues.
+
+    Raises:
+        KeyError: If the response dictionary does not contain the specified `results_key`.
+        TypeError: If the response type from the Jira API is neither list nor dict.
+    """
+
+    def wrapper(*args, **kwargs) -> list:
+        kwargs["limit"] = min(100, number_of_issues_to_fetch)
+        kwargs["start"] = kwargs.get("start", 0)
+
+        all_results = []
+        num_iterations = math.ceil(number_of_issues_to_fetch / kwargs["limit"])
+
+        for _ in range(num_iterations):
+            response = func(*args, **kwargs)
+            if isinstance(response, list):
+                all_results += response
+            elif isinstance(response, dict):
+                if results_key not in response:
+                    raise KeyError(f'Response object is missing "{results_key}" key.')
+                all_results += response[results_key]
+            else:
+                raise TypeError(
+                    f"""Unexpected response type from Jira API.
+                    Response type has to be either list or dict, got: {type(response).__name__}."""
+                )
+            kwargs["start"] += kwargs["limit"]
+
+        return all_results
+
+    return wrapper
+
+
+class JiraAccessConfig(AccessConfig):
+    password: Optional[str] = Field(
+        description="Jira password or Cloud API token",
+        default=None,
+    )
+    token: Optional[str] = Field(
+        description="Jira Personal Access Token",
+        default=None,
+    )
+
+
+class JiraConnectionConfig(ConnectionConfig):
+    url: str = Field(description="URL of the Jira instance")
+    username: Optional[str] = Field(
+        description="Username or email for authentication",
+        default=None,
+    )
+    cloud: bool = Field(description="Authenticate to Jira Cloud", default=False)
+    access_config: Secret[JiraAccessConfig] = Field(description="Access configuration for Jira")
+
+    def model_post_init(self, __context):
+        access_configs = self.access_config.get_secret_value()
+        basic_auth = self.username and access_configs.password
+        pat_auth = access_configs.token
+        if self.cloud and not basic_auth:
+            raise ValueError(
+                "cloud authentication requires username and API token (--password), "
+                "see: https://atlassian-python-api.readthedocs.io/"
+            )
+        if basic_auth and pat_auth:
+            raise ValueError(
+                "both password and token provided, only one allowed, "
+                "see: https://atlassian-python-api.readthedocs.io/"
+            )
+        if not (basic_auth or pat_auth):
+            raise ValueError(
+                "no form of auth provided, see: https://atlassian-python-api.readthedocs.io/"
+            )
+
+    @requires_dependencies(["atlassian"], extras="jira")
+    @contextmanager
+    def get_client(self) -> Generator["Jira", None, None]:
+        from atlassian import Jira
+
+        access_configs = self.access_config.get_secret_value()
+        with Jira(
+            url=self.url,
+            username=self.username,
+            password=access_configs.password,
+            token=access_configs.token,
+            cloud=self.cloud,
+        ) as client:
+            yield client
+
+
+class JiraIndexerConfig(IndexerConfig):
+    projects: Optional[List[str]] = Field(None, description="List of project keys")
+    boards: Optional[List[str]] = Field(None, description="List of board IDs")
+    issues: Optional[List[str]] = Field(None, description="List of issue keys or IDs")
+
+
+@dataclass
+class JiraIndexer(Indexer):
+    connection_config: JiraConnectionConfig
+    index_config: JiraIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        try:
+            with self.connection_config.get_client() as client:
+                response = client.get_permissions("BROWSE_PROJECTS")
+                permitted = response["permissions"]["BROWSE_PROJECTS"]["havePermission"]
+        except Exception as e:
+            logger.error(f"Failed to connect to Jira: {e}", exc_info=True)
+            raise SourceConnectionError(f"Failed to connect to Jira: {e}")
+        if not permitted:
+            raise ValueError(
+                """The provided user is not permitted to browse projects
+                from the given Jira organization URL.
+                Try checking username, password, token and the url arguments.""",
+            )
+        logger.info("Connection to Jira successful.")
+
+    def _get_issues_within_single_project(self, project_key: str) -> List[JiraIssueMetadata]:
+        with self.connection_config.get_client() as client:
+            number_of_issues_to_fetch = client.get_project_issues_count(project=project_key)
+            if isinstance(number_of_issues_to_fetch, dict):
+                if "total" not in number_of_issues_to_fetch:
+                    raise KeyError('Response object is missing "total" key.')
+                number_of_issues_to_fetch = number_of_issues_to_fetch["total"]
+            if not number_of_issues_to_fetch:
+                logger.warning(f"No issues found in project: {project_key}. Skipping!")
+                return []
+            get_project_issues = issues_fetcher_wrapper(
+                client.get_all_project_issues,
+                results_key="issues",
+                number_of_issues_to_fetch=number_of_issues_to_fetch,
+            )
+            issues = get_project_issues(project=project_key, fields=["key", "id"])
+            logger.debug(f"Found {len(issues)} issues in project: {project_key}")
+            return [JiraIssueMetadata(id=issue["id"], key=issue["key"]) for issue in issues]
+
+    def _get_issues_within_projects(self) -> List[JiraIssueMetadata]:
+        project_keys = self.index_config.projects
+        if not project_keys:
+            # for when a component list is provided, without any projects
+            if self.index_config.boards or self.index_config.issues:
+                return []
+            # for when no components are provided. all projects will be ingested
+            else:
+                with self.connection_config.get_client() as client:
+                    project_keys = [project["key"] for project in client.projects()]
+        return [
+            issue
+            for project_key in project_keys
+            for issue in self._get_issues_within_single_project(project_key)
+        ]
+
+    def _get_issues_within_single_board(self, board_id: str) -> List[JiraIssueMetadata]:
+        with self.connection_config.get_client() as client:
+            get_board_issues = issues_fetcher_wrapper(
+                client.get_issues_for_board,
+                results_key="issues",
+            )
+            issues = get_board_issues(board_id=board_id, fields=["key", "id"], jql=None)
+            logger.debug(f"Found {len(issues)} issues in board: {board_id}")
+            return [
+                JiraIssueMetadata(id=issue["id"], key=issue["key"], board_id=board_id)
+                for issue in issues
+            ]
+
+    def _get_issues_within_boards(self) -> List[JiraIssueMetadata]:
+        if not self.index_config.boards:
+            return []
+        return [
+            issue
+            for board_id in self.index_config.boards
+            for issue in self._get_issues_within_single_board(board_id)
+        ]
+
+    def _get_issues(self) -> List[JiraIssueMetadata]:
+        with self.connection_config.get_client() as client:
+            issues = [
+                client.get_issue(issue_id_or_key=issue_key, fields=["key", "id"])
+                for issue_key in self.index_config.issues or []
+            ]
+        return [JiraIssueMetadata(id=issue["id"], key=issue["key"]) for issue in issues]
+
+    def get_issues(self) -> List[JiraIssueMetadata]:
+        issues = [
+            *self._get_issues_within_boards(),
+            *self._get_issues_within_projects(),
+            *self._get_issues(),
+        ]
+        # Select unique issues by issue 'id'.
+        # Since boards issues are fetched first,
+        # if there are duplicates, the board issues will be kept,
+        # in order to keep issue 'board_id' information.
+        seen = set()
+        unique_issues: List[JiraIssueMetadata] = []
+        for issue in issues:
+            if issue.id not in seen:
+                unique_issues.append(issue)
+                seen.add(issue.id)
+        return unique_issues
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        from time import time
+
+        issues = self.get_issues()
+        for issue in issues:
+            # Build metadata
+            metadata = FileDataSourceMetadata(
+                date_processed=str(time()),
+                record_locator=issue.to_dict(),
+            )
+
+            # Construct relative path and filename
+            filename = f"{issue.id}.txt"
+            relative_path = str(Path(issue.project_id) / filename)
+
+            source_identifiers = SourceIdentifiers(
+                filename=filename,
+                fullpath=relative_path,
+                rel_path=relative_path,
+            )
+
+            file_data = FileData(
+                identifier=issue.id,
+                connector_type=self.connector_type,
+                metadata=metadata,
+                additional_metadata=issue.to_dict(),
+                source_identifiers=source_identifiers,
+            )
+            yield file_data
+
+
+class JiraDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class JiraDownloader(Downloader):
+    connection_config: JiraConnectionConfig
+    download_config: JiraDownloaderConfig = field(default_factory=JiraDownloaderConfig)
+    connector_type: str = CONNECTOR_TYPE
+
+    def _get_id_fields_for_issue(
+        self, issue: dict, c_sep: str = DEFAULT_C_SEP, r_sep: str = DEFAULT_R_SEP
+    ) -> str:
+        issue_id, key = issue["id"], issue["key"]
+        return f"IssueID_IssueKey:{issue_id}{c_sep}{key}{r_sep}"
+
+    def _get_project_fields_for_issue(
+        self, issue: dict, c_sep: str = DEFAULT_C_SEP, r_sep: str = DEFAULT_R_SEP
+    ) -> str:
+        if "project" in issue:
+            return (
+                f"ProjectID_Key:{issue['project']['key']}{c_sep}{issue['project']['name']}{r_sep}"
+            )
+        else:
+            return ""
+
+    def _get_dropdown_fields_for_issue(
+        self, issue: dict, c_sep: str = DEFAULT_C_SEP, r_sep: str = DEFAULT_R_SEP
+    ) -> str:
+        return f"""
+        IssueType:{issue["issuetype"]["name"]}
+        {r_sep}
+        Status:{issue["status"]["name"]}
+        {r_sep}
+        Priority:{issue["priority"]}
+        {r_sep}
+        AssigneeID_Name:{issue["assignee"]["accountId"]}{c_sep}{issue["assignee"]["displayName"]}
+        {r_sep}
+        ReporterAdr_Name:{issue["reporter"]["emailAddress"]}{c_sep}{issue["reporter"]["displayName"]}
+        {r_sep}
+        Labels:{c_sep.join(issue["labels"])}
+        {r_sep}
+        Components:{c_sep.join([component["name"] for component in issue["components"]])}
+        {r_sep}
+        """
+
+    def _get_subtasks_for_issue(self, issue: dict) -> str:
+        return ""
+
+    def _get_text_fields_for_issue(
+        self, issue: dict, c_sep: str = DEFAULT_C_SEP, r_sep: str = DEFAULT_R_SEP
+    ) -> str:
+        return f"""
+        {issue["summary"]}
+        {r_sep}
+        {issue["description"]}
+        {r_sep}
+        {c_sep.join([attachment["self"] for attachment in issue["attachment"]])}
+        {r_sep}
+        """
+
+    def _get_comments_for_issue(
+        self, issue: dict, c_sep: str = DEFAULT_C_SEP, r_sep: str = DEFAULT_R_SEP
+    ) -> str:
+        return c_sep.join(
+            [self._get_fields_for_comment(comment) for comment in issue["comment"]["comments"]],
+        )
+
+    def _get_fields_for_comment(
+        self, comment, c_sep: str = DEFAULT_C_SEP, r_sep: str = DEFAULT_R_SEP
+    ) -> str:
+        return f"{comment['author']['displayName']}{c_sep}{comment['body']}{r_sep}"
+
+    def form_templated_string(
+        self,
+        issue: dict,
+        parsed_fields: Union[FieldGetter, dict],
+        c_sep: str = "|||",
+        r_sep: str = "\n\n\n",
+    ) -> str:
+        """Forms a template string via parsing the fields from the API response object on the issue
+        The template string will be saved to the disk, and then will be processed by partition."""
+        return r_sep.join(
+            [
+                self._get_id_fields_for_issue(issue),
+                self._get_project_fields_for_issue(parsed_fields),
+                self._get_dropdown_fields_for_issue(parsed_fields),
+                self._get_subtasks_for_issue(parsed_fields),
+                self._get_comments_for_issue(parsed_fields),
+                self._get_text_fields_for_issue(parsed_fields),
+            ],
+        )
+
+    def update_file_data(self, file_data: FileData, issue: dict) -> None:
+        file_data.metadata.date_created = issue["fields"]["created"]
+        file_data.metadata.date_modified = issue["fields"]["updated"]
+        file_data.display_name = issue["fields"]["project"]["name"]
+
+    def get_issue(self, issue_key: str) -> dict:
+        try:
+            with self.connection_config.get_client() as client:
+                return client.issue(key=issue_key)
+        except Exception as e:
+            logger.error(f"Failed to fetch issue with key: {issue_key}: {e}", exc_info=True)
+            raise SourceConnectionError(f"Failed to fetch issue with key: {issue_key}: {e}")
+
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        issue_key = file_data.additional_metadata.get("key")
+        if not issue_key:
+            raise ValueError("Issue key not found in metadata.")
+        issue = self.get_issue(issue_key)
+        parsed_fields = nested_object_to_field_getter(issue["fields"])
+        issue_str = self.form_templated_string(issue, parsed_fields)
+
+        download_path = self.get_download_path(file_data)
+        if download_path is None:
+            raise ValueError("File data is missing source identifiers data.")
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(download_path, "w") as f:
+            f.write(issue_str)
+        self.update_file_data(file_data, issue)
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+jira_source_entry = SourceRegistryEntry(
+    connection_config=JiraConnectionConfig,
+    indexer_config=JiraIndexerConfig,
+    indexer=JiraIndexer,
+    downloader_config=JiraDownloaderConfig,
+    downloader=JiraDownloader,
+)
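Note (illustrative, not part of the diff): two helpers in the new connector are worth a sketch. `nested_object_to_field_getter` wraps raw issue fields so missing or `None` nested values resolve to an empty `FieldGetter` rather than raising `KeyError`, letting the templated-string builders chain lookups safely; `issues_fetcher_wrapper` pages through an endpoint 100 items at a time. The endpoint stub below is hypothetical:

from unstructured_ingest.v2.processes.connectors.jira import (
    issues_fetcher_wrapper,
    nested_object_to_field_getter,
)

# Missing/None nested fields collapse to an empty FieldGetter.
fields = nested_object_to_field_getter({"status": {"name": "Open"}, "assignee": None})
print(fields["status"]["name"])           # Open
print(fields["assignee"]["displayName"])  # {} - no KeyError on unassigned issues

# Pagination: a stub standing in for a Jira client method that accepts
# `limit` and `start` keyword arguments and returns {"issues": [...]}.
def stub_endpoint(limit: int = 50, start: int = 0) -> dict:
    total = 230
    return {"issues": [{"id": str(i), "key": f"PROJ-{i}"} for i in range(start, min(start + limit, total))]}

fetch_all = issues_fetcher_wrapper(stub_endpoint, results_key="issues", number_of_issues_to_fetch=230)
print(len(fetch_all()))  # 230, gathered in three requests of at most 100 each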