unstructured-ingest 1.0.46.dev0__py3-none-any.whl → 1.0.48__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the differences between the package versions exactly as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -1 +1 @@
1
- __version__ = "1.0.46-dev0" # pragma: no cover
1
+ __version__ = "1.0.48" # pragma: no cover
@@ -134,8 +134,15 @@ class DeltaTableUploader(Uploader):
134
134
 
135
135
  response = s3_client.get_bucket_location(Bucket=bucket_name)
136
136
 
137
- if self.connection_config.aws_region != response.get("LocationConstraint"):
138
- raise ValueError("Wrong AWS Region was provided.")
137
+ bucket_region = _normalize_location_constraint(response.get("LocationConstraint"))
138
+
139
+ if self.connection_config.aws_region != bucket_region:
140
+ raise ValueError(
141
+ "Wrong AWS region provided: bucket "
142
+ f"'{bucket_name}' resides in '{bucket_region}', "
143
+ "but configuration specifies "
144
+ f"'{self.connection_config.aws_region}'."
145
+ )
139
146
 
140
147
  except Exception as e:
141
148
  logger.error(f"failed to validate connection: {e}", exc_info=True)
@@ -270,6 +277,29 @@ class DeltaTableUploader(Uploader):
270
277
  self.upload_dataframe(df=df, file_data=file_data)
271
278
 
272
279
 
280
+ def _normalize_location_constraint(location: Optional[str]) -> str:
281
+ """Return canonical AWS region name for a LocationConstraint value.
282
+
283
+ The S3 GetBucketLocation operation returns `null` (`None`) for buckets in
284
+ the legacy `us-east-1` region and `EU` for very old buckets that were
285
+ created in the historical `EU` region (now `eu-west-1`). For every other
286
+ region the API already returns the correct AWS region string. This helper
287
+ normalises the legacy values so callers can reliably compare regions.
288
+
289
+ Args:
290
+ location: The LocationConstraint value returned by the S3 GetBucketLocation operation.
291
+
292
+ Returns:
293
+ The canonical AWS region name for the given location constraint.
294
+ """
295
+
296
+ if location is None:
297
+ return "us-east-1"
298
+ if location == "EU":
299
+ return "eu-west-1"
300
+ return location
301
+
302
+
273
303
  delta_table_destination_entry = DestinationRegistryEntry(
274
304
  connection_config=DeltaTableConnectionConfig,
275
305
  uploader=DeltaTableUploader,
@@ -41,10 +41,17 @@ DEFAULT_R_SEP = "\n"
41
41
  class JiraIssueMetadata(BaseModel):
42
42
  id: str
43
43
  key: str
44
+ fields: Optional[dict] = None # Add fields to capture attachment data
44
45
 
45
46
  def get_project_id(self) -> str:
46
47
  return self.key.split("-")[0]
47
48
 
49
+ def get_attachments(self) -> List[dict]:
50
+ """Extract attachment information from fields"""
51
+ if self.fields and "attachment" in self.fields:
52
+ return self.fields["attachment"]
53
+ return []
54
+
48
55
 
49
56
  class FieldGetter(dict):
50
57
  def __getitem__(self, key):
@@ -196,15 +203,17 @@ class JiraIndexer(Indexer):
196
203
  yield JiraIssueMetadata.model_validate(issue)
197
204
 
198
205
  def _get_issues_within_projects(self) -> Generator[JiraIssueMetadata, None, None]:
199
- fields = ["key", "id", "status"]
206
+ fields = ["key", "id", "status", "attachment"] # Add attachment field
200
207
  jql = "project in ({})".format(", ".join(self.index_config.projects))
201
208
  jql = self._update_jql(jql)
202
209
  logger.debug(f"running jql: {jql}")
203
210
  return self.run_jql(jql=jql, fields=fields)
204
211
 
205
- def _get_issues_within_single_board(self, board_id: str) -> List[JiraIssueMetadata]:
212
+ def _get_issues_within_single_board(
213
+ self, board_id: str
214
+ ) -> Generator[JiraIssueMetadata, None, None]:
206
215
  with self.connection_config.get_client() as client:
207
- fields = ["key", "id"]
216
+ fields = ["key", "id", "attachment"] # Add attachment field
208
217
  if self.index_config.status_filters:
209
218
  jql = "status in ({}) ORDER BY id".format(
210
219
  ", ".join([f'"{s}"' for s in self.index_config.status_filters])
@@ -233,23 +242,38 @@ class JiraIndexer(Indexer):
233
242
  return jql
234
243
 
235
244
  def _get_issues_by_keys(self) -> Generator[JiraIssueMetadata, None, None]:
236
- fields = ["key", "id"]
245
+ fields = ["key", "id", "attachment"] # Add attachment field
237
246
  jql = "key in ({})".format(", ".join(self.index_config.issues))
238
247
  jql = self._update_jql(jql)
239
248
  logger.debug(f"running jql: {jql}")
240
249
  return self.run_jql(jql=jql, fields=fields)
241
250
 
242
251
  def _create_file_data_from_issue(self, issue: JiraIssueMetadata) -> FileData:
243
- # Build metadata
252
+ # Construct relative path and filename first
253
+ filename = f"{issue.key}.txt"
254
+ relative_path = str(Path(issue.get_project_id()) / filename)
255
+
256
+ # Build metadata with attachments included in record_locator
257
+ record_locator = {"id": issue.id, "key": issue.key, "full_path": relative_path}
258
+
259
+ # Add attachments to record_locator if they exist
260
+ attachments = issue.get_attachments()
261
+ if attachments:
262
+ record_locator["attachments"] = [
263
+ {
264
+ "id": att["id"],
265
+ "filename": att["filename"],
266
+ "created": att.get("created"),
267
+ "mimeType": att.get("mimeType"),
268
+ }
269
+ for att in attachments
270
+ ]
271
+
244
272
  metadata = FileDataSourceMetadata(
245
273
  date_processed=str(time()),
246
- record_locator=issue.model_dump(),
274
+ record_locator=record_locator,
247
275
  )
248
276
 
249
- # Construct relative path and filename
250
- filename = f"{issue.id}.txt"
251
- relative_path = str(Path(issue.get_project_id()) / filename)
252
-
253
277
  source_identifiers = SourceIdentifiers(
254
278
  filename=filename,
255
279
  fullpath=relative_path,
@@ -400,21 +424,37 @@ class JiraDownloader(Downloader):
400
424
  self, attachment_dict: dict, parent_filedata: FileData
401
425
  ) -> FileData:
402
426
  new_filedata = parent_filedata.model_copy(deep=True)
403
- if new_filedata.metadata.record_locator is None:
404
- new_filedata.metadata.record_locator = {}
405
- new_filedata.metadata.record_locator["parent_issue"] = (
406
- parent_filedata.metadata.record_locator["id"]
407
- )
427
+
428
+ # Create attachment record_locator with parent context
429
+ attachment_record_locator = {
430
+ "id": attachment_dict["id"],
431
+ "filename": attachment_dict["filename"],
432
+ "created": attachment_dict.get("created"),
433
+ "mimeType": attachment_dict.get("mimeType"),
434
+ "parent": {
435
+ "id": parent_filedata.metadata.record_locator["id"],
436
+ "key": parent_filedata.metadata.record_locator["key"],
437
+ "full_path": parent_filedata.source_identifiers.fullpath,
438
+ },
439
+ }
440
+
408
441
  # Append an identifier for attachment to not conflict with issue ids
409
442
  new_filedata.identifier = "{}a".format(attachment_dict["id"])
410
- filename = attachment_dict["filename"]
411
- new_filedata.metadata.filesize_bytes = attachment_dict.pop("size", None)
412
- new_filedata.metadata.date_created = attachment_dict.pop("created", None)
413
- new_filedata.metadata.url = attachment_dict.pop("self", None)
414
- new_filedata.metadata.record_locator = attachment_dict
443
+ filename = f"{attachment_dict['filename']}.{attachment_dict['id']}"
444
+ new_filedata.metadata.filesize_bytes = attachment_dict.get("size")
445
+ new_filedata.metadata.date_created = attachment_dict.get("created")
446
+ new_filedata.metadata.url = attachment_dict.get("self")
447
+ new_filedata.metadata.record_locator = attachment_record_locator
448
+ full_path = (
449
+ Path(parent_filedata.source_identifiers.fullpath).with_suffix("") / Path(filename)
450
+ ).as_posix()
451
+ new_filedata.metadata.record_locator["full_path"] = full_path
415
452
  new_filedata.source_identifiers = SourceIdentifiers(
416
453
  filename=filename,
417
- fullpath=(Path(str(attachment_dict["id"])) / Path(filename)).as_posix(),
454
+ # add issue_parent to the fullpath and rel_path
455
+ # to ensure that the attachment is saved in the same folder as the parent issue
456
+ fullpath=full_path,
457
+ rel_path=full_path,
418
458
  )
419
459
  return new_filedata
420
460
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unstructured_ingest
3
- Version: 1.0.46.dev0
3
+ Version: 1.0.48
4
4
  Summary: Local ETL data pipeline to get data RAG ready
5
5
  Author-email: Unstructured Technologies <devops@unstructuredai.io>
6
6
  License-Expression: Apache-2.0
@@ -1,5 +1,5 @@
1
1
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
2
- unstructured_ingest/__version__.py,sha256=IKm0h9VzCLTBY4HULhS6HxTXk7DRRvPTI9GEd28jGJM,48
2
+ unstructured_ingest/__version__.py,sha256=oxIYGy4gQ782VEa3kKpqmMWHoZ9FUqF1PhsN2QyWE88,43
3
3
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
4
4
  unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
5
5
  unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
@@ -68,12 +68,12 @@ unstructured_ingest/processes/connectors/azure_ai_search.py,sha256=szhSRXzUHk0DE
68
68
  unstructured_ingest/processes/connectors/chroma.py,sha256=q5_Fu4xb6_W_NyrPxVa3-jVwZLqVdlBNlR4dFvbd7l0,7235
69
69
  unstructured_ingest/processes/connectors/confluence.py,sha256=aA2B_FPdAjlVAJtmMldYu6lld2sR-6JL5tWh7yItiwg,22828
70
70
  unstructured_ingest/processes/connectors/couchbase.py,sha256=KCHoYDNya9B05NIB5D78zXoizFyfpJRepcYBe1nLSOs,12298
71
- unstructured_ingest/processes/connectors/delta_table.py,sha256=JrpiX9V-YD1VhExKi6KFwlYatCheSs3t-xB3Td1BVFk,11487
71
+ unstructured_ingest/processes/connectors/delta_table.py,sha256=Y0P-4knPBc7Q8QwlvlDe6ksIKppNY4dBZhC1vwGARi0,12661
72
72
  unstructured_ingest/processes/connectors/discord.py,sha256=CD-SBECMdr3pnmqbPvBMyPU2cBroXUhyW6F7L3laP6A,5348
73
73
  unstructured_ingest/processes/connectors/github.py,sha256=smHCz6jOH1p_hW2S25bYunBBj_pYjz8HTw6wkzaJz_A,7765
74
74
  unstructured_ingest/processes/connectors/gitlab.py,sha256=Fdq6_lk-By1JDmLGVjoKJkaHESiKTZsbvoHhMsljlE0,10114
75
75
  unstructured_ingest/processes/connectors/google_drive.py,sha256=jQb4_rKL_tJg7s7m-H8nrvc0GKwxiubtg8KL3-ZIGPM,35304
76
- unstructured_ingest/processes/connectors/jira.py,sha256=BuZwExmdcI-R_MGPUwm8TnFh2jEjjwkyA1T51Bgqh-U,18558
76
+ unstructured_ingest/processes/connectors/jira.py,sha256=DCP1pSagcjmQP0he1N726WNLsiHXlTUuhFwAtxt4zs8,20278
77
77
  unstructured_ingest/processes/connectors/kdbai.py,sha256=XhxYpKSAoFPBsDQWwNuLX03DCxOVr7yquj9VYM55Rtc,5174
78
78
  unstructured_ingest/processes/connectors/local.py,sha256=CesMduUiSPqdJpqIyW28icGvGAo4hfa-4fzbYajmMSo,7450
79
79
  unstructured_ingest/processes/connectors/milvus.py,sha256=L-PM5osheNyNsLGYZmiF3rRmeulp7Ejk92JCoaQ_F9Y,12075
@@ -231,8 +231,8 @@ unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01q
231
231
  unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
232
232
  unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
233
233
  unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
234
- unstructured_ingest-1.0.46.dev0.dist-info/METADATA,sha256=nT9dK2ltS-aMKLa_nCewjwQSUYlaDVnvNUNoWDwjHPw,8847
235
- unstructured_ingest-1.0.46.dev0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
236
- unstructured_ingest-1.0.46.dev0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
237
- unstructured_ingest-1.0.46.dev0.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
238
- unstructured_ingest-1.0.46.dev0.dist-info/RECORD,,
234
+ unstructured_ingest-1.0.48.dist-info/METADATA,sha256=mFnSAhVaigxOwDyNbTK1XCVXRwY40tsATd3zYSEQE9k,8842
235
+ unstructured_ingest-1.0.48.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
236
+ unstructured_ingest-1.0.48.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
237
+ unstructured_ingest-1.0.48.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
238
+ unstructured_ingest-1.0.48.dist-info/RECORD,,