unstructured-ingest 1.0.19__tar.gz → 1.0.23__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (239)
  1. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/PKG-INFO +2 -2
  2. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/pyproject.toml +2 -2
  3. unstructured_ingest-1.0.23/unstructured_ingest/__version__.py +1 -0
  4. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/embed/mixedbreadai.py +28 -45
  5. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/jira.py +209 -171
  6. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +22 -3
  7. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +1 -0
  8. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +1 -0
  9. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +1 -0
  10. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +1 -0
  11. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +1 -0
  12. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +1 -0
  13. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +1 -0
  14. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +1 -1
  15. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +1 -0
  16. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +1 -0
  17. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +1 -0
  18. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +1 -0
  19. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +1 -0
  20. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +1 -0
  21. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +1 -0
  22. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +1 -0
  23. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +1 -0
  24. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +1 -0
  25. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +1 -0
  26. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +1 -0
  27. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +1 -0
  28. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/weaviate/weaviate.py +30 -13
  29. unstructured_ingest-1.0.19/unstructured_ingest/__version__.py +0 -1
  30. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/.gitignore +0 -0
  31. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/LICENSE.md +0 -0
  32. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/README.md +0 -0
  33. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/__init__.py +0 -0
  34. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/cli/README.md +0 -0
  35. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/cli/__init__.py +0 -0
  36. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/cli/base/__init__.py +0 -0
  37. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/cli/base/cmd.py +0 -0
  38. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/cli/base/dest.py +0 -0
  39. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/cli/base/importer.py +0 -0
  40. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/cli/base/src.py +0 -0
  41. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/cli/cli.py +0 -0
  42. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/cli/cmds.py +0 -0
  43. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/cli/utils/__init__.py +0 -0
  44. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/cli/utils/click.py +0 -0
  45. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/cli/utils/model_conversion.py +0 -0
  46. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/data_types/__init__.py +0 -0
  47. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/data_types/entities.py +0 -0
  48. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/data_types/file_data.py +0 -0
  49. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/embed/__init__.py +0 -0
  50. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/embed/azure_openai.py +0 -0
  51. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/embed/bedrock.py +0 -0
  52. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/embed/huggingface.py +0 -0
  53. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/embed/interfaces.py +0 -0
  54. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/embed/octoai.py +0 -0
  55. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/embed/openai.py +0 -0
  56. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/embed/togetherai.py +0 -0
  57. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/embed/vertexai.py +0 -0
  58. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/embed/voyageai.py +0 -0
  59. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/error.py +0 -0
  60. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/errors_v2.py +0 -0
  61. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/interfaces/__init__.py +0 -0
  62. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/interfaces/connector.py +0 -0
  63. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/interfaces/downloader.py +0 -0
  64. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/interfaces/indexer.py +0 -0
  65. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/interfaces/process.py +0 -0
  66. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/interfaces/processor.py +0 -0
  67. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/interfaces/upload_stager.py +0 -0
  68. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/interfaces/uploader.py +0 -0
  69. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/logger.py +0 -0
  70. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/main.py +0 -0
  71. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/otel.py +0 -0
  72. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/pipeline/__init__.py +0 -0
  73. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/pipeline/interfaces.py +0 -0
  74. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/pipeline/otel.py +0 -0
  75. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/pipeline/pipeline.py +0 -0
  76. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/pipeline/steps/__init__.py +0 -0
  77. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/pipeline/steps/chunk.py +0 -0
  78. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/pipeline/steps/download.py +0 -0
  79. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/pipeline/steps/embed.py +0 -0
  80. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/pipeline/steps/filter.py +0 -0
  81. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/pipeline/steps/index.py +0 -0
  82. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/pipeline/steps/partition.py +0 -0
  83. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/pipeline/steps/stage.py +0 -0
  84. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/pipeline/steps/uncompress.py +0 -0
  85. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/pipeline/steps/upload.py +0 -0
  86. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/__init__.py +0 -0
  87. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/chunker.py +0 -0
  88. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connector_registry.py +0 -0
  89. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/__init__.py +0 -0
  90. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/airtable.py +0 -0
  91. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  92. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +0 -0
  93. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +0 -0
  94. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/astradb.py +0 -0
  95. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/azure_ai_search.py +0 -0
  96. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/chroma.py +0 -0
  97. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/confluence.py +0 -0
  98. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/couchbase.py +0 -0
  99. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/databricks/__init__.py +0 -0
  100. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/databricks/volumes.py +0 -0
  101. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/databricks/volumes_aws.py +0 -0
  102. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/databricks/volumes_azure.py +0 -0
  103. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +0 -0
  104. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/databricks/volumes_native.py +0 -0
  105. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/databricks/volumes_table.py +0 -0
  106. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/delta_table.py +0 -0
  107. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/discord.py +0 -0
  108. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/duckdb/__init__.py +0 -0
  109. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/duckdb/base.py +0 -0
  110. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/duckdb/duckdb.py +0 -0
  111. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/duckdb/motherduck.py +0 -0
  112. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/elasticsearch/__init__.py +0 -0
  113. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +0 -0
  114. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +0 -0
  115. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/fsspec/__init__.py +0 -0
  116. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/fsspec/azure.py +0 -0
  117. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/fsspec/box.py +0 -0
  118. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/fsspec/dropbox.py +0 -0
  119. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/fsspec/fsspec.py +0 -0
  120. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/fsspec/gcs.py +0 -0
  121. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/fsspec/s3.py +0 -0
  122. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/fsspec/sftp.py +0 -0
  123. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/fsspec/utils.py +0 -0
  124. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/github.py +0 -0
  125. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/gitlab.py +0 -0
  126. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/google_drive.py +0 -0
  127. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +0 -0
  128. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +0 -0
  129. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/kafka/__init__.py +0 -0
  130. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/kafka/cloud.py +0 -0
  131. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/kafka/kafka.py +0 -0
  132. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/kafka/local.py +0 -0
  133. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/kdbai.py +0 -0
  134. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/lancedb/__init__.py +0 -0
  135. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/lancedb/aws.py +0 -0
  136. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/lancedb/azure.py +0 -0
  137. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/lancedb/cloud.py +0 -0
  138. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/lancedb/gcp.py +0 -0
  139. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/lancedb/lancedb.py +0 -0
  140. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/lancedb/local.py +0 -0
  141. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/local.py +0 -0
  142. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/milvus.py +0 -0
  143. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/mongodb.py +0 -0
  144. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/neo4j.py +0 -0
  145. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  146. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/client.py +0 -0
  147. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/connector.py +0 -0
  148. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/helpers.py +0 -0
  149. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +0 -0
  150. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +0 -0
  151. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +0 -0
  152. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +0 -0
  153. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/interfaces.py +0 -0
  154. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  155. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/block.py +0 -0
  156. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +0 -0
  157. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +0 -0
  158. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +0 -0
  159. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -0
  160. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +0 -0
  161. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +0 -0
  162. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +0 -0
  163. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/code.py +0 -0
  164. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +0 -0
  165. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +0 -0
  166. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +0 -0
  167. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +0 -0
  168. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/file.py +0 -0
  169. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +0 -0
  170. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/image.py +0 -0
  171. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +0 -0
  172. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +0 -0
  173. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +0 -0
  174. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +0 -0
  175. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +0 -0
  176. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +0 -0
  177. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +0 -0
  178. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/table.py +0 -0
  179. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +0 -0
  180. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/template.py +0 -0
  181. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +0 -0
  182. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +0 -0
  183. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +0 -0
  184. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/blocks/video.py +0 -0
  185. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database.py +0 -0
  186. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +1 -1
  187. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/date.py +0 -0
  188. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/file.py +0 -0
  189. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/page.py +0 -0
  190. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/parent.py +0 -0
  191. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/rich_text.py +0 -0
  192. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/notion/types/user.py +0 -0
  193. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/onedrive.py +0 -0
  194. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/outlook.py +0 -0
  195. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/pinecone.py +0 -0
  196. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/qdrant/__init__.py +0 -0
  197. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/qdrant/cloud.py +0 -0
  198. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/qdrant/local.py +0 -0
  199. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/qdrant/qdrant.py +0 -0
  200. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/qdrant/server.py +0 -0
  201. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/redisdb.py +0 -0
  202. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/salesforce.py +0 -0
  203. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/sharepoint.py +0 -0
  204. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/slack.py +0 -0
  205. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/sql/__init__.py +0 -0
  206. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +0 -0
  207. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/sql/postgres.py +0 -0
  208. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/sql/singlestore.py +0 -0
  209. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/sql/snowflake.py +0 -0
  210. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/sql/sql.py +0 -0
  211. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/sql/sqlite.py +0 -0
  212. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/sql/vastdb.py +0 -0
  213. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/utils.py +0 -0
  214. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/vectara.py +0 -0
  215. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/weaviate/__init__.py +0 -0
  216. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/weaviate/cloud.py +0 -0
  217. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/weaviate/embedded.py +0 -0
  218. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/weaviate/local.py +0 -0
  219. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  220. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/zendesk/client.py +0 -0
  221. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -0
  222. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/embedder.py +0 -0
  223. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/filter.py +0 -0
  224. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/partitioner.py +0 -0
  225. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/uncompress.py +0 -0
  226. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/utils/__init__.py +0 -0
  227. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/processes/utils/blob_storage.py +0 -0
  228. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/unstructured_api.py +0 -0
  229. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/utils/__init__.py +0 -0
  230. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/utils/chunking.py +0 -0
  231. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/utils/compression.py +0 -0
  232. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/utils/constants.py +0 -0
  233. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/utils/data_prep.py +0 -0
  234. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/utils/dep_check.py +0 -0
  235. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/utils/html.py +0 -0
  236. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/utils/ndjson.py +0 -0
  237. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/utils/pydantic_models.py +0 -0
  238. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/utils/string_and_date_utils.py +0 -0
  239. {unstructured_ingest-1.0.19 → unstructured_ingest-1.0.23}/unstructured_ingest/utils/table.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unstructured_ingest
3
- Version: 1.0.19
3
+ Version: 1.0.23
4
4
  Summary: Local ETL data pipeline to get data RAG ready
5
5
  Author-email: Unstructured Technologies <devops@unstructuredai.io>
6
6
  License-Expression: Apache-2.0
@@ -114,7 +114,7 @@ Requires-Dist: unstructured[md]; extra == 'md'
114
114
  Provides-Extra: milvus
115
115
  Requires-Dist: pymilvus; extra == 'milvus'
116
116
  Provides-Extra: mixedbreadai
117
- Requires-Dist: mixedbread-ai; extra == 'mixedbreadai'
117
+ Requires-Dist: mixedbread; extra == 'mixedbreadai'
118
118
  Provides-Extra: mongodb
119
119
  Requires-Dist: pymongo; extra == 'mongodb'
120
120
  Provides-Extra: msg
@@ -136,14 +136,14 @@ test = [
136
136
  "deepdiff",
137
137
  "bs4",
138
138
  "pandas",
139
-
140
139
  # Connector specific deps
141
140
  "cryptography",
142
141
  "fsspec",
143
142
  "vertexai",
144
143
  "pyiceberg",
145
144
  "pyarrow",
146
- "networkx"
145
+ "networkx",
146
+ "htmlbuilder",
147
147
  ]
148
148
  # Add constraints needed for CI
149
149
  ci = [
@@ -0,0 +1 @@
1
+ __version__ = "1.0.23" # pragma: no cover
@@ -19,8 +19,7 @@ TRUNCATION_STRATEGY = "end"
19
19
 
20
20
 
21
21
  if TYPE_CHECKING:
22
- from mixedbread_ai.client import AsyncMixedbreadAI, MixedbreadAI
23
- from mixedbread_ai.core import RequestOptions
22
+ from mixedbread import AsyncMixedbread, Mixedbread
24
23
 
25
24
 
26
25
  class MixedbreadAIEmbeddingConfig(EmbeddingConfig):
@@ -44,31 +43,33 @@ class MixedbreadAIEmbeddingConfig(EmbeddingConfig):
44
43
  )
45
44
 
46
45
  @requires_dependencies(
47
- ["mixedbread_ai"],
48
- extras="mixedbreadai",
46
+ ["mixedbread"],
47
+ extras="embed-mixedbreadai",
49
48
  )
50
- def get_client(self) -> "MixedbreadAI":
49
+ def get_client(self) -> "Mixedbread":
51
50
  """
52
51
  Create the Mixedbread AI client.
53
52
 
54
53
  Returns:
55
- MixedbreadAI: Initialized client.
54
+ Mixedbread: Initialized client.
56
55
  """
57
- from mixedbread_ai.client import MixedbreadAI
56
+ from mixedbread import Mixedbread
58
57
 
59
- return MixedbreadAI(
58
+ return Mixedbread(
60
59
  api_key=self.api_key.get_secret_value(),
60
+ max_retries=MAX_RETRIES,
61
61
  )
62
62
 
63
63
  @requires_dependencies(
64
- ["mixedbread_ai"],
65
- extras="mixedbreadai",
64
+ ["mixedbread"],
65
+ extras="embed-mixedbreadai",
66
66
  )
67
- def get_async_client(self) -> "AsyncMixedbreadAI":
68
- from mixedbread_ai.client import AsyncMixedbreadAI
67
+ def get_async_client(self) -> "AsyncMixedbread":
68
+ from mixedbread import AsyncMixedbread
69
69
 
70
- return AsyncMixedbreadAI(
70
+ return AsyncMixedbread(
71
71
  api_key=self.api_key.get_secret_value(),
72
+ max_retries=MAX_RETRIES,
72
73
  )
73
74
 
74
75
 
@@ -88,29 +89,20 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
88
89
  return self.embed_query(query="Q")
89
90
 
90
91
  @requires_dependencies(
91
- ["mixedbread_ai"],
92
+ ["mixedbread"],
92
93
  extras="embed-mixedbreadai",
93
94
  )
94
- def get_request_options(self) -> "RequestOptions":
95
- from mixedbread_ai.core import RequestOptions
96
-
97
- return RequestOptions(
98
- max_retries=MAX_RETRIES,
99
- timeout_in_seconds=TIMEOUT,
100
- additional_headers={"User-Agent": USER_AGENT},
101
- )
102
-
103
- def get_client(self) -> "MixedbreadAI":
95
+ def get_client(self) -> "Mixedbread":
104
96
  return self.config.get_client()
105
97
 
106
- def embed_batch(self, client: "MixedbreadAI", batch: list[str]) -> list[list[float]]:
107
- response = client.embeddings(
98
+ def embed_batch(self, client: "Mixedbread", batch: list[str]) -> list[list[float]]:
99
+ response = client.embed(
108
100
  model=self.config.embedder_model_name,
101
+ input=batch,
109
102
  normalized=True,
110
103
  encoding_format=ENCODING_FORMAT,
111
- truncation_strategy=TRUNCATION_STRATEGY,
112
- request_options=self.get_request_options(),
113
- input=batch,
104
+ extra_headers={"User-Agent": USER_AGENT},
105
+ timeout=TIMEOUT,
114
106
  )
115
107
  return [datum.embedding for datum in response.data]
116
108
 
@@ -124,28 +116,19 @@ class AsyncMixedbreadAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
124
116
  return await self.embed_query(query="Q")
125
117
 
126
118
  @requires_dependencies(
127
- ["mixedbread_ai"],
119
+ ["mixedbread"],
128
120
  extras="embed-mixedbreadai",
129
121
  )
130
- def get_request_options(self) -> "RequestOptions":
131
- from mixedbread_ai.core import RequestOptions
132
-
133
- return RequestOptions(
134
- max_retries=MAX_RETRIES,
135
- timeout_in_seconds=TIMEOUT,
136
- additional_headers={"User-Agent": USER_AGENT},
137
- )
138
-
139
- def get_client(self) -> "AsyncMixedbreadAI":
122
+ def get_client(self) -> "AsyncMixedbread":
140
123
  return self.config.get_async_client()
141
124
 
142
- async def embed_batch(self, client: "AsyncMixedbreadAI", batch: list[str]) -> list[list[float]]:
143
- response = await client.embeddings(
125
+ async def embed_batch(self, client: "AsyncMixedbread", batch: list[str]) -> list[list[float]]:
126
+ response = await client.embed(
144
127
  model=self.config.embedder_model_name,
128
+ input=batch,
145
129
  normalized=True,
146
130
  encoding_format=ENCODING_FORMAT,
147
- truncation_strategy=TRUNCATION_STRATEGY,
148
- request_options=self.get_request_options(),
149
- input=batch,
131
+ extra_headers={"User-Agent": USER_AGENT},
132
+ timeout=TIMEOUT,
150
133
  )
151
134
  return [datum.embedding for datum in response.data]
@@ -1,11 +1,11 @@
1
- import math
2
1
  from collections import abc
3
2
  from contextlib import contextmanager
4
3
  from dataclasses import dataclass, field
5
4
  from pathlib import Path
6
- from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Union
5
+ from time import time
6
+ from typing import TYPE_CHECKING, Any, Callable, Generator, List, Optional, Union, cast
7
7
 
8
- from pydantic import Field, Secret
8
+ from pydantic import BaseModel, Field, Secret
9
9
 
10
10
  from unstructured_ingest.data_types.file_data import (
11
11
  FileData,
@@ -21,6 +21,7 @@ from unstructured_ingest.interfaces import (
21
21
  DownloadResponse,
22
22
  Indexer,
23
23
  IndexerConfig,
24
+ download_responses,
24
25
  )
25
26
  from unstructured_ingest.logger import logger
26
27
  from unstructured_ingest.processes.connector_registry import (
@@ -37,24 +38,13 @@ DEFAULT_C_SEP = " " * 5
37
38
  DEFAULT_R_SEP = "\n"
38
39
 
39
40
 
40
- @dataclass
41
- class JiraIssueMetadata:
41
+ class JiraIssueMetadata(BaseModel):
42
42
  id: str
43
43
  key: str
44
- board_id: Optional[str] = None
45
44
 
46
- @property
47
- def project_id(self) -> str:
45
+ def get_project_id(self) -> str:
48
46
  return self.key.split("-")[0]
49
47
 
50
- def to_dict(self) -> Dict[str, Union[str, None]]:
51
- return {
52
- "id": self.id,
53
- "key": self.key,
54
- "board_id": self.board_id,
55
- "project_id": self.project_id,
56
- }
57
-
58
48
 
59
49
  class FieldGetter(dict):
60
50
  def __getitem__(self, key):
@@ -77,52 +67,32 @@ def nested_object_to_field_getter(obj: dict) -> Union[FieldGetter, dict]:
77
67
  return obj
78
68
 
79
69
 
80
- def issues_fetcher_wrapper(func, results_key="results", number_of_issues_to_fetch: int = 100):
81
- """
82
- A decorator function that wraps around a function to fetch issues from Jira API in a paginated
83
- manner. This is required because the Jira API has a limit of 100 issues per request.
84
-
85
- Args:
86
- func (callable): The function to be wrapped. This function should accept `limit` and `start`
87
- as keyword arguments.
88
- results_key (str, optional): The key in the response dictionary that contains the list of
89
- results. Defaults to "results".
90
- number_of_issues_to_fetch (int, optional): The total number of issues to fetch. Defaults to
91
- 100.
92
-
93
- Returns:
94
- list: A list of all fetched issues.
95
-
96
- Raises:
97
- KeyError: If the response dictionary does not contain the specified `results_key`.
98
- TypeError: If the response type from the Jira API is neither list nor dict.
99
- """
100
-
101
- def wrapper(*args, **kwargs) -> list:
102
- kwargs["limit"] = min(100, number_of_issues_to_fetch)
103
- kwargs["start"] = kwargs.get("start", 0)
104
-
105
- all_results = []
106
- num_iterations = math.ceil(number_of_issues_to_fetch / kwargs["limit"])
107
-
108
- for _ in range(num_iterations):
109
- response = func(*args, **kwargs)
110
- if isinstance(response, list):
111
- all_results += response
112
- elif isinstance(response, dict):
113
- if results_key not in response:
114
- raise KeyError(f'Response object is missing "{results_key}" key.')
115
- all_results += response[results_key]
116
- else:
117
- raise TypeError(
118
- f"""Unexpected response type from Jira API.
119
- Response type has to be either list or dict, got: {type(response).__name__}."""
120
- )
121
- kwargs["start"] += kwargs["limit"]
122
-
123
- return all_results
124
-
125
- return wrapper
70
+ def api_token_based_generator(
71
+ fn: Callable, key: str = "issues", **kwargs
72
+ ) -> Generator[dict, None, None]:
73
+ nextPageToken = kwargs.pop("nextPageToken", None)
74
+ while True:
75
+ resp = fn(nextPageToken=nextPageToken, **kwargs)
76
+ issues = resp.get(key, [])
77
+ for issue in issues:
78
+ yield issue
79
+ nextPageToken = resp.get("nextPageToken")
80
+ if not nextPageToken:
81
+ break
82
+
83
+
84
+ def api_page_based_generator(
85
+ fn: Callable, key: str = "issues", **kwargs
86
+ ) -> Generator[dict, None, None]:
87
+ start = kwargs.pop("start", 0)
88
+ while True:
89
+ resp = fn(start=start, **kwargs)
90
+ issues = resp.get(key, [])
91
+ if not issues:
92
+ break
93
+ for issue in issues:
94
+ yield issue
95
+ start += len(issues)
126
96
 
127
97
 
128
98
  class JiraAccessConfig(AccessConfig):
@@ -169,8 +139,28 @@ class JiraConnectionConfig(ConnectionConfig):
169
139
  def get_client(self) -> Generator["Jira", None, None]:
170
140
  from atlassian import Jira
171
141
 
142
+ class CustomJira(Jira):
143
+ """
144
+ Custom Jira class to fix the issue with the get_project_issues_count method.
145
+ This class inherits from the original Jira class and overrides the method to
146
+ handle the response correctly.
147
+ Once the issue is fixed in the original library, this class can be removed.
148
+ """
149
+
150
+ def __init__(self, *args, **kwargs):
151
+ super().__init__(*args, **kwargs)
152
+
153
+ def get_project_issues_count(self, project: str) -> int:
154
+ jql = f'project = "{project}" '
155
+ response = self.jql(jql, fields="*none")
156
+ response = cast("dict", response)
157
+ if "total" in response:
158
+ return response["total"]
159
+ else:
160
+ return len(response["issues"])
161
+
172
162
  access_configs = self.access_config.get_secret_value()
173
- with Jira(
163
+ with CustomJira(
174
164
  url=self.url,
175
165
  username=self.username,
176
166
  password=access_configs.password,
@@ -181,9 +171,17 @@ class JiraConnectionConfig(ConnectionConfig):
181
171
 
182
172
 
183
173
  class JiraIndexerConfig(IndexerConfig):
184
- projects: Optional[List[str]] = Field(None, description="List of project keys")
185
- boards: Optional[List[str]] = Field(None, description="List of board IDs")
186
- issues: Optional[List[str]] = Field(None, description="List of issue keys or IDs")
174
+ projects: Optional[list[str]] = Field(None, description="List of project keys")
175
+ boards: Optional[list[str]] = Field(None, description="List of board IDs")
176
+ issues: Optional[list[str]] = Field(None, description="List of issue keys or IDs")
177
+ status_filters: Optional[list[str]] = Field(
178
+ default=None,
179
+ description="List of status filters, if provided will only return issues that have these statuses", # noqa: E501
180
+ )
181
+
182
+ def model_post_init(self, context: Any, /) -> None:
183
+ if not self.projects and not self.boards and not self.issues:
184
+ raise ValueError("At least one of projects, boards, or issues must be provided.")
187
185
 
188
186
 
189
187
  @dataclass
@@ -208,122 +206,103 @@ class JiraIndexer(Indexer):
208
206
  )
209
207
  logger.info("Connection to Jira successful.")
210
208
 
211
- def _get_issues_within_single_project(self, project_key: str) -> List[JiraIssueMetadata]:
209
+ def _get_issues_within_projects(self) -> Generator[JiraIssueMetadata, None, None]:
212
210
  with self.connection_config.get_client() as client:
213
- number_of_issues_to_fetch = client.get_project_issues_count(project=project_key)
214
- if isinstance(number_of_issues_to_fetch, dict):
215
- if "total" not in number_of_issues_to_fetch:
216
- raise KeyError('Response object is missing "total" key.')
217
- number_of_issues_to_fetch = number_of_issues_to_fetch["total"]
218
- if not number_of_issues_to_fetch:
219
- logger.warning(f"No issues found in project: {project_key}. Skipping!")
220
- return []
221
- get_project_issues = issues_fetcher_wrapper(
222
- client.get_all_project_issues,
223
- results_key="issues",
224
- number_of_issues_to_fetch=number_of_issues_to_fetch,
225
- )
226
- issues = get_project_issues(project=project_key, fields=["key", "id"])
227
- logger.debug(f"Found {len(issues)} issues in project: {project_key}")
228
- return [JiraIssueMetadata(id=issue["id"], key=issue["key"]) for issue in issues]
229
-
230
- def _get_issues_within_projects(self) -> List[JiraIssueMetadata]:
231
- project_keys = self.index_config.projects
232
- if not project_keys:
233
- # for when a component list is provided, without any projects
234
- if self.index_config.boards or self.index_config.issues:
235
- return []
236
- # for when no components are provided. all projects will be ingested
237
- else:
238
- with self.connection_config.get_client() as client:
239
- project_keys = [project["key"] for project in client.projects()]
240
- return [
241
- issue
242
- for project_key in project_keys
243
- for issue in self._get_issues_within_single_project(project_key)
244
- ]
211
+ fields = ["key", "id"]
212
+ jql = "project in ({})".format(", ".join(self.index_config.projects))
213
+ jql = self._update_jql(jql)
214
+ for issue in api_token_based_generator(client.enhanced_jql, jql=jql, fields=fields):
215
+ yield JiraIssueMetadata.model_validate(issue)
245
216
 
246
217
  def _get_issues_within_single_board(self, board_id: str) -> List[JiraIssueMetadata]:
247
218
  with self.connection_config.get_client() as client:
248
- get_board_issues = issues_fetcher_wrapper(
249
- client.get_issues_for_board,
250
- results_key="issues",
251
- )
252
- issues = get_board_issues(board_id=board_id, fields=["key", "id"], jql=None)
253
- logger.debug(f"Found {len(issues)} issues in board: {board_id}")
254
- return [
255
- JiraIssueMetadata(id=issue["id"], key=issue["key"], board_id=board_id)
256
- for issue in issues
257
- ]
258
-
259
- def _get_issues_within_boards(self) -> List[JiraIssueMetadata]:
219
+ fields = ["key", "id"]
220
+ if self.index_config.status_filters:
221
+ jql = "status in ({}) ORDER BY id".format(
222
+ ", ".join([f'"{s}"' for s in self.index_config.status_filters])
223
+ )
224
+ else:
225
+ jql = "ORDER BY id"
226
+ for issue in api_page_based_generator(
227
+ fn=client.get_issues_for_board, board_id=board_id, fields=fields, jql=jql
228
+ ):
229
+ yield JiraIssueMetadata.model_validate(issue)
230
+
231
+ def _get_issues_within_boards(self) -> Generator[JiraIssueMetadata, None, None]:
260
232
  if not self.index_config.boards:
261
- return []
262
- return [
263
- issue
264
- for board_id in self.index_config.boards
265
- for issue in self._get_issues_within_single_board(board_id)
266
- ]
267
-
268
- def _get_issues(self) -> List[JiraIssueMetadata]:
269
- with self.connection_config.get_client() as client:
270
- issues = [
271
- client.get_issue(issue_id_or_key=issue_key, fields=["key", "id"])
272
- for issue_key in self.index_config.issues or []
273
- ]
274
- return [JiraIssueMetadata(id=issue["id"], key=issue["key"]) for issue in issues]
275
-
276
- def get_issues(self) -> List[JiraIssueMetadata]:
277
- issues = [
278
- *self._get_issues_within_boards(),
279
- *self._get_issues_within_projects(),
280
- *self._get_issues(),
281
- ]
282
- # Select unique issues by issue 'id'.
283
- # Since boards issues are fetched first,
284
- # if there are duplicates, the board issues will be kept,
285
- # in order to keep issue 'board_id' information.
286
- seen = set()
287
- unique_issues: List[JiraIssueMetadata] = []
288
- for issue in issues:
289
- if issue.id not in seen:
290
- unique_issues.append(issue)
291
- seen.add(issue.id)
292
- return unique_issues
233
+ yield
234
+ for board_id in self.index_config.boards:
235
+ for issue in self._get_issues_within_single_board(board_id=board_id):
236
+ yield issue
237
+
238
+ def _update_jql(self, jql: str) -> str:
239
+ if self.index_config.status_filters:
240
+ jql += " and status in ({})".format(
241
+ ", ".join([f'"{s}"' for s in self.index_config.status_filters])
242
+ )
243
+ jql = jql + " ORDER BY id"
244
+ return jql
293
245
 
294
- def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
295
- from time import time
246
+ def _get_issues_by_keys(self) -> Generator[JiraIssueMetadata, None, None]:
247
+ with self.connection_config.get_client() as client:
248
+ fields = ["key", "id"]
249
+ jql = "key in ({})".format(", ".join(self.index_config.issues))
250
+ jql = self._update_jql(jql)
251
+ for issue in api_token_based_generator(client.enhanced_jql, jql=jql, fields=fields):
252
+ yield JiraIssueMetadata.model_validate(issue)
253
+
254
+ def _create_file_data_from_issue(self, issue: JiraIssueMetadata) -> FileData:
255
+ # Build metadata
256
+ metadata = FileDataSourceMetadata(
257
+ date_processed=str(time()),
258
+ record_locator=issue.model_dump(),
259
+ )
296
260
 
297
- issues = self.get_issues()
298
- for issue in issues:
299
- # Build metadata
300
- metadata = FileDataSourceMetadata(
301
- date_processed=str(time()),
302
- record_locator=issue.to_dict(),
303
- )
261
+ # Construct relative path and filename
262
+ filename = f"{issue.id}.txt"
263
+ relative_path = str(Path(issue.get_project_id()) / filename)
304
264
 
305
- # Construct relative path and filename
306
- filename = f"{issue.id}.txt"
307
- relative_path = str(Path(issue.project_id) / filename)
265
+ source_identifiers = SourceIdentifiers(
266
+ filename=filename,
267
+ fullpath=relative_path,
268
+ rel_path=relative_path,
269
+ )
308
270
 
309
- source_identifiers = SourceIdentifiers(
310
- filename=filename,
311
- fullpath=relative_path,
312
- rel_path=relative_path,
313
- )
271
+ file_data = FileData(
272
+ identifier=issue.id,
273
+ connector_type=self.connector_type,
274
+ metadata=metadata,
275
+ additional_metadata=issue.model_dump(),
276
+ source_identifiers=source_identifiers,
277
+ )
278
+ return file_data
279
+
280
+ def get_generators(self) -> List[Callable]:
281
+ generators = []
282
+ if self.index_config.boards:
283
+ generators.append(self._get_issues_within_boards)
284
+ if self.index_config.issues:
285
+ generators.append(self._get_issues_by_keys)
286
+ if self.index_config.projects:
287
+ generators.append(self._get_issues_within_projects)
288
+ return generators
314
289
 
315
- file_data = FileData(
316
- identifier=issue.id,
317
- connector_type=self.connector_type,
318
- metadata=metadata,
319
- additional_metadata=issue.to_dict(),
320
- source_identifiers=source_identifiers,
321
- )
322
- yield file_data
290
+ def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
291
+ seen_keys = []
292
+ for gen in self.get_generators():
293
+ for issue in gen():
294
+ if not issue:
295
+ continue
296
+ if issue.key in seen_keys:
297
+ continue
298
+ seen_keys.append(issue.key)
299
+ yield self._create_file_data_from_issue(issue=issue)
323
300
 
324
301
 
325
302
  class JiraDownloaderConfig(DownloaderConfig):
326
- pass
303
+ download_attachments: bool = Field(
304
+ default=False, description="If True, will download any attachments and process as well"
305
+ )
327
306
 
328
307
 
329
308
  @dataclass
@@ -428,7 +407,56 @@ class JiraDownloader(Downloader):
428
407
  logger.error(f"Failed to fetch issue with key: {issue_key}: {e}", exc_info=True)
429
408
  raise SourceConnectionError(f"Failed to fetch issue with key: {issue_key}: {e}")
430
409
 
431
- def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
410
+ def generate_attachment_file_data(
411
+ self, attachment_dict: dict, parent_filedata: FileData
412
+ ) -> FileData:
413
+ new_filedata = parent_filedata.model_copy(deep=True)
414
+ if new_filedata.metadata.record_locator is None:
415
+ new_filedata.metadata.record_locator = {}
416
+ new_filedata.metadata.record_locator["parent_issue"] = (
417
+ parent_filedata.metadata.record_locator["id"]
418
+ )
419
+ # Append an identifier for attachment to not conflict with issue ids
420
+ new_filedata.identifier = "{}a".format(attachment_dict["id"])
421
+ filename = attachment_dict["filename"]
422
+ new_filedata.metadata.filesize_bytes = attachment_dict.pop("size", None)
423
+ new_filedata.metadata.date_created = attachment_dict.pop("created", None)
424
+ new_filedata.metadata.url = attachment_dict.pop("self", None)
425
+ new_filedata.metadata.record_locator = attachment_dict
426
+ new_filedata.source_identifiers = SourceIdentifiers(
427
+ filename=filename,
428
+ fullpath=(Path(str(attachment_dict["id"])) / Path(filename)).as_posix(),
429
+ )
430
+ return new_filedata
431
+
432
+ def process_attachments(
433
+ self, file_data: FileData, attachments: list[dict]
434
+ ) -> list[DownloadResponse]:
435
+ with self.connection_config.get_client() as client:
436
+ download_path = self.get_download_path(file_data)
437
+ attachment_download_dir = download_path.parent / "attachments"
438
+ attachment_download_dir.mkdir(parents=True, exist_ok=True)
439
+ download_responses = []
440
+ for attachment in attachments:
441
+ attachment_filename = Path(attachment["filename"])
442
+ attachment_id = attachment["id"]
443
+ attachment_download_path = attachment_download_dir / Path(
444
+ attachment_id
445
+ ).with_suffix(attachment_filename.suffix)
446
+ resp = client.get_attachment_content(attachment_id=attachment_id)
447
+ with open(attachment_download_path, "wb") as f:
448
+ f.write(resp)
449
+ attachment_filedata = self.generate_attachment_file_data(
450
+ attachment_dict=attachment, parent_filedata=file_data
451
+ )
452
+ download_responses.append(
453
+ self.generate_download_response(
454
+ file_data=attachment_filedata, download_path=attachment_download_path
455
+ )
456
+ )
457
+ return download_responses
458
+
459
+ def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
432
460
  issue_key = file_data.additional_metadata.get("key")
433
461
  if not issue_key:
434
462
  raise ValueError("Issue key not found in metadata.")
@@ -443,7 +471,17 @@ class JiraDownloader(Downloader):
443
471
  with open(download_path, "w") as f:
444
472
  f.write(issue_str)
445
473
  self.update_file_data(file_data, issue)
446
- return self.generate_download_response(file_data=file_data, download_path=download_path)
474
+ download_response = self.generate_download_response(
475
+ file_data=file_data, download_path=download_path
476
+ )
477
+ if self.download_config.download_attachments and (
478
+ attachments := issue.get("fields", {}).get("attachment")
479
+ ):
480
+ attachment_responses = self.process_attachments(
481
+ file_data=file_data, attachments=attachments
482
+ )
483
+ download_response = [download_response] + attachment_responses
484
+ return download_response
447
485
 
448
486
 
449
487
  jira_source_entry = SourceRegistryEntry(