unstructured-ingest 1.0.37__tar.gz → 1.0.40__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (239)
  1. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/PKG-INFO +1 -1
  2. unstructured_ingest-1.0.40/unstructured_ingest/__version__.py +1 -0
  3. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/confluence.py +20 -1
  4. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/milvus.py +81 -7
  5. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +8 -9
  6. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/pinecone.py +1 -1
  7. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/utils/html.py +14 -1
  8. unstructured_ingest-1.0.37/unstructured_ingest/__version__.py +0 -1
  9. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/.gitignore +0 -0
  10. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/LICENSE.md +0 -0
  11. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/README.md +0 -0
  12. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/pyproject.toml +0 -0
  13. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/__init__.py +0 -0
  14. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/README.md +0 -0
  15. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/__init__.py +0 -0
  16. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/base/__init__.py +0 -0
  17. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/base/cmd.py +0 -0
  18. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/base/dest.py +0 -0
  19. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/base/importer.py +0 -0
  20. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/base/src.py +0 -0
  21. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/cli.py +0 -0
  22. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/cmds.py +0 -0
  23. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/utils/__init__.py +0 -0
  24. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/utils/click.py +0 -0
  25. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/utils/model_conversion.py +0 -0
  26. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/data_types/__init__.py +0 -0
  27. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/data_types/entities.py +0 -0
  28. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/data_types/file_data.py +0 -0
  29. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/__init__.py +0 -0
  30. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/azure_openai.py +0 -0
  31. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/bedrock.py +0 -0
  32. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/huggingface.py +0 -0
  33. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/interfaces.py +0 -0
  34. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/mixedbreadai.py +0 -0
  35. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/octoai.py +0 -0
  36. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/openai.py +0 -0
  37. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/togetherai.py +0 -0
  38. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/vertexai.py +0 -0
  39. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/voyageai.py +0 -0
  40. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/error.py +0 -0
  41. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/errors_v2.py +0 -0
  42. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/interfaces/__init__.py +0 -0
  43. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/interfaces/connector.py +0 -0
  44. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/interfaces/downloader.py +0 -0
  45. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/interfaces/indexer.py +0 -0
  46. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/interfaces/process.py +0 -0
  47. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/interfaces/processor.py +0 -0
  48. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/interfaces/upload_stager.py +0 -0
  49. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/interfaces/uploader.py +0 -0
  50. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/logger.py +0 -0
  51. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/main.py +0 -0
  52. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/otel.py +0 -0
  53. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/__init__.py +0 -0
  54. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/interfaces.py +0 -0
  55. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/otel.py +0 -0
  56. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/pipeline.py +0 -0
  57. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/steps/__init__.py +0 -0
  58. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/steps/chunk.py +0 -0
  59. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/steps/download.py +0 -0
  60. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/steps/embed.py +0 -0
  61. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/steps/filter.py +0 -0
  62. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/steps/index.py +0 -0
  63. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/steps/partition.py +0 -0
  64. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/steps/stage.py +0 -0
  65. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/steps/uncompress.py +0 -0
  66. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/steps/upload.py +0 -0
  67. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/__init__.py +0 -0
  68. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/chunker.py +0 -0
  69. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connector_registry.py +0 -0
  70. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/__init__.py +0 -0
  71. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/airtable.py +0 -0
  72. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  73. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +0 -0
  74. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +0 -0
  75. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/astradb.py +0 -0
  76. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/azure_ai_search.py +0 -0
  77. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/chroma.py +0 -0
  78. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/couchbase.py +0 -0
  79. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/databricks/__init__.py +0 -0
  80. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/databricks/volumes.py +0 -0
  81. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/databricks/volumes_aws.py +0 -0
  82. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/databricks/volumes_azure.py +0 -0
  83. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +0 -0
  84. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/databricks/volumes_native.py +0 -0
  85. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/databricks/volumes_table.py +0 -0
  86. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/delta_table.py +0 -0
  87. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/discord.py +0 -0
  88. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/duckdb/__init__.py +0 -0
  89. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/duckdb/base.py +0 -0
  90. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/duckdb/duckdb.py +0 -0
  91. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/duckdb/motherduck.py +0 -0
  92. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/elasticsearch/__init__.py +0 -0
  93. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +0 -0
  94. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +0 -0
  95. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/fsspec/__init__.py +0 -0
  96. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/fsspec/azure.py +0 -0
  97. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/fsspec/box.py +0 -0
  98. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/fsspec/dropbox.py +0 -0
  99. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/fsspec/fsspec.py +0 -0
  100. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/fsspec/gcs.py +0 -0
  101. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/fsspec/s3.py +0 -0
  102. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/fsspec/sftp.py +0 -0
  103. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/fsspec/utils.py +0 -0
  104. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/github.py +0 -0
  105. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/gitlab.py +0 -0
  106. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/google_drive.py +0 -0
  107. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +0 -0
  108. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +0 -0
  109. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/jira.py +0 -0
  110. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/kafka/__init__.py +0 -0
  111. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/kafka/cloud.py +0 -0
  112. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/kafka/kafka.py +0 -0
  113. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/kafka/local.py +0 -0
  114. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/kdbai.py +0 -0
  115. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/lancedb/__init__.py +0 -0
  116. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/lancedb/aws.py +0 -0
  117. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/lancedb/azure.py +0 -0
  118. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/lancedb/cloud.py +0 -0
  119. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/lancedb/gcp.py +0 -0
  120. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/lancedb/lancedb.py +0 -0
  121. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/lancedb/local.py +0 -0
  122. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/local.py +0 -0
  123. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/mongodb.py +0 -0
  124. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/neo4j.py +0 -0
  125. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  126. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/client.py +0 -0
  127. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/connector.py +0 -0
  128. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/helpers.py +0 -0
  129. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +0 -0
  130. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +0 -0
  131. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +0 -0
  132. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +0 -0
  133. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/interfaces.py +0 -0
  134. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  135. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/block.py +0 -0
  136. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +0 -0
  137. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +0 -0
  138. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +0 -0
  139. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -0
  140. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +0 -0
  141. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +0 -0
  142. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +0 -0
  143. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/code.py +0 -0
  144. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +0 -0
  145. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +0 -0
  146. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +0 -0
  147. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +0 -0
  148. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/file.py +0 -0
  149. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +0 -0
  150. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/image.py +0 -0
  151. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +0 -0
  152. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +0 -0
  153. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +0 -0
  154. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +0 -0
  155. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +0 -0
  156. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +0 -0
  157. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/table.py +0 -0
  158. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +0 -0
  159. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/template.py +0 -0
  160. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +0 -0
  161. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +0 -0
  162. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +0 -0
  163. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/video.py +0 -0
  164. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database.py +0 -0
  165. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +0 -0
  166. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +0 -0
  167. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +0 -0
  168. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +0 -0
  169. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +0 -0
  170. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +0 -0
  171. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +0 -0
  172. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +0 -0
  173. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -0
  174. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +0 -0
  175. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +0 -0
  176. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +0 -0
  177. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +0 -0
  178. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +0 -0
  179. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +0 -0
  180. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +0 -0
  181. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +0 -0
  182. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +0 -0
  183. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +0 -0
  184. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +0 -0
  185. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +0 -0
  186. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +0 -0
  187. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +0 -0
  188. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/date.py +0 -0
  189. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/file.py +0 -0
  190. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/page.py +0 -0
  191. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/parent.py +0 -0
  192. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/rich_text.py +0 -0
  193. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/user.py +0 -0
  194. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/onedrive.py +0 -0
  195. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/outlook.py +0 -0
  196. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/qdrant/__init__.py +0 -0
  197. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/qdrant/cloud.py +0 -0
  198. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/qdrant/local.py +0 -0
  199. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/qdrant/qdrant.py +0 -0
  200. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/qdrant/server.py +0 -0
  201. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/redisdb.py +0 -0
  202. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/salesforce.py +0 -0
  203. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/sharepoint.py +0 -0
  204. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/slack.py +0 -0
  205. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/sql/__init__.py +0 -0
  206. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +0 -0
  207. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/sql/postgres.py +0 -0
  208. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/sql/singlestore.py +0 -0
  209. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/sql/snowflake.py +0 -0
  210. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/sql/sql.py +0 -0
  211. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/sql/sqlite.py +0 -0
  212. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/sql/vastdb.py +0 -0
  213. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/utils.py +0 -0
  214. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/vectara.py +0 -0
  215. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/weaviate/__init__.py +0 -0
  216. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/weaviate/cloud.py +0 -0
  217. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/weaviate/embedded.py +0 -0
  218. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/weaviate/local.py +0 -0
  219. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/weaviate/weaviate.py +0 -0
  220. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  221. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/zendesk/client.py +0 -0
  222. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -0
  223. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/embedder.py +0 -0
  224. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/filter.py +0 -0
  225. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/partitioner.py +0 -0
  226. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/uncompress.py +0 -0
  227. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/utils/__init__.py +0 -0
  228. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/utils/blob_storage.py +0 -0
  229. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/unstructured_api.py +0 -0
  230. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/utils/__init__.py +0 -0
  231. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/utils/chunking.py +0 -0
  232. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/utils/compression.py +0 -0
  233. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/utils/constants.py +0 -0
  234. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/utils/data_prep.py +0 -0
  235. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/utils/dep_check.py +0 -0
  236. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/utils/ndjson.py +0 -0
  237. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/utils/pydantic_models.py +0 -0
  238. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/utils/string_and_date_utils.py +0 -0
  239. {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.40}/unstructured_ingest/utils/table.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unstructured_ingest
3
- Version: 1.0.37
3
+ Version: 1.0.40
4
4
  Summary: Local ETL data pipeline to get data RAG ready
5
5
  Author-email: Unstructured Technologies <devops@unstructuredai.io>
6
6
  License-Expression: Apache-2.0
@@ -0,0 +1 @@
1
+ __version__ = "1.0.40" # pragma: no cover
@@ -33,6 +33,8 @@ from unstructured_ingest.utils.string_and_date_utils import fix_unescaped_unicod
33
33
 
34
34
  if TYPE_CHECKING:
35
35
  from atlassian import Confluence
36
+ from bs4 import BeautifulSoup
37
+ from bs4.element import Tag
36
38
 
37
39
  CONNECTOR_TYPE = "confluence"
38
40
 
@@ -235,11 +237,28 @@ class ConfluenceIndexer(Indexer):
235
237
  yield file_data
236
238
 
237
239
 
238
- class ConfluenceDownloaderConfig(DownloaderConfig, HtmlMixin):
240
+ class ConfluenceDownloaderConfig(HtmlMixin, DownloaderConfig):
239
241
  max_num_metadata_permissions: int = Field(
240
242
  250, description="Approximate maximum number of permissions included in metadata"
241
243
  )
242
244
 
245
+ @requires_dependencies(["bs4"])
246
+ def _find_hyperlink_tags(self, html_soup: "BeautifulSoup") -> list["Tag"]:
247
+ from bs4.element import Tag
248
+
249
+ return [
250
+ element
251
+ for element in html_soup.find_all(
252
+ "a",
253
+ attrs={
254
+ "class": "confluence-embedded-file",
255
+ "data-linked-resource-type": "attachment",
256
+ "href": True,
257
+ },
258
+ )
259
+ if isinstance(element, Tag)
260
+ ]
261
+
243
262
 
244
263
  @dataclass
245
264
  class ConfluenceDownloader(Downloader):
@@ -1,7 +1,7 @@
1
1
  import json
2
2
  from contextlib import contextmanager
3
3
  from dataclasses import dataclass, field
4
- from typing import TYPE_CHECKING, Any, Generator, Optional, Union
4
+ from typing import TYPE_CHECKING, Any, Generator, Optional
5
5
 
6
6
  from dateutil import parser
7
7
  from pydantic import Field, Secret
@@ -97,10 +97,16 @@ class MilvusUploadStager(UploadStager):
97
97
 
98
98
  def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
99
99
  working_data = element_dict.copy()
100
- if self.upload_stager_config.flatten_metadata and (
101
- metadata := working_data.pop("metadata", None)
102
- ):
103
- working_data.update(flatten_dict(metadata, keys_to_omit=["data_source_record_locator"]))
100
+
101
+ if self.upload_stager_config.flatten_metadata:
102
+ metadata: dict[str, Any] = working_data.pop("metadata", {})
103
+ flattened_metadata = flatten_dict(
104
+ metadata,
105
+ separator="_",
106
+ flatten_lists=False,
107
+ remove_none=True,
108
+ )
109
+ working_data.update(flattened_metadata)
104
110
 
105
111
  # TODO: milvus sdk doesn't seem to support defaults via the schema yet,
106
112
  # remove once that gets updated
@@ -154,6 +160,23 @@ class MilvusUploader(Uploader):
154
160
  upload_config: MilvusUploaderConfig
155
161
  connector_type: str = CONNECTOR_TYPE
156
162
 
163
+ def has_dynamic_fields_enabled(self) -> bool:
164
+ """Check if the target collection has dynamic fields enabled."""
165
+ try:
166
+ with self.get_client() as client:
167
+ collection_info = client.describe_collection(self.upload_config.collection_name)
168
+
169
+ # Check if dynamic field is enabled
170
+ # The schema info should contain enable_dynamic_field or enableDynamicField
171
+ schema_info = collection_info.get(
172
+ "enable_dynamic_field",
173
+ collection_info.get("enableDynamicField", False),
174
+ )
175
+ return bool(schema_info)
176
+ except Exception as e:
177
+ logger.warning(f"Could not determine if collection has dynamic fields enabled: {e}")
178
+ return False
179
+
157
180
  @DestinationConnectionError.wrap
158
181
  def precheck(self):
159
182
  from pymilvus import MilvusException
@@ -164,6 +187,7 @@ class MilvusUploader(Uploader):
164
187
  raise DestinationConnectionError(
165
188
  f"Collection '{self.upload_config.collection_name}' does not exist"
166
189
  )
190
+
167
191
  except MilvusException as milvus_exception:
168
192
  raise DestinationConnectionError(
169
193
  f"failed to precheck Milvus: {str(milvus_exception.message)}"
@@ -193,16 +217,66 @@ class MilvusUploader(Uploader):
193
217
  )
194
218
 
195
219
  @requires_dependencies(["pymilvus"], extras="milvus")
196
- def insert_results(self, data: Union[dict, list[dict]]):
220
+ def _prepare_data_for_insert(self, data: list[dict]) -> list[dict]:
221
+ """
222
+ Conforms the provided data to the schema of the target Milvus collection.
223
+ - If dynamic fields are enabled, it ensures JSON-stringified fields are decoded.
224
+ - If dynamic fields are disabled, it filters out any fields not present in the schema.
225
+ """
226
+
227
+ dynamic_fields_enabled = self.has_dynamic_fields_enabled()
228
+
229
+ # If dynamic fields are enabled, 'languages' field needs to be a list
230
+ if dynamic_fields_enabled:
231
+ logger.debug("Dynamic fields enabled, ensuring 'languages' field is a list.")
232
+ prepared_data = []
233
+ for item in data:
234
+ new_item = item.copy()
235
+ if "languages" in new_item and isinstance(new_item["languages"], str):
236
+ try:
237
+ new_item["languages"] = json.loads(new_item["languages"])
238
+ except (json.JSONDecodeError, TypeError):
239
+ logger.warning(
240
+ f"Could not JSON decode languages field: {new_item['languages']}. "
241
+ "Leaving as string.",
242
+ )
243
+ prepared_data.append(new_item)
244
+ return prepared_data
245
+
246
+ # If dynamic fields are not enabled, we need to filter out the metadata fields
247
+ # to avoid insertion errors for fields not defined in the schema
248
+ with self.get_client() as client:
249
+ collection_info = client.describe_collection(
250
+ self.upload_config.collection_name,
251
+ )
252
+ schema_fields = {
253
+ field["name"]
254
+ for field in collection_info.get("fields", [])
255
+ if not field.get("auto_id", False)
256
+ }
257
+ # Remove metadata fields that are not part of the base schema
258
+ filtered_data = []
259
+ for item in data:
260
+ filtered_item = {key: value for key, value in item.items() if key in schema_fields}
261
+ filtered_data.append(filtered_item)
262
+ return filtered_data
263
+
264
+ @requires_dependencies(["pymilvus"], extras="milvus")
265
+ def insert_results(self, data: list[dict]):
197
266
  from pymilvus import MilvusException
198
267
 
199
268
  logger.info(
200
269
  f"uploading {len(data)} entries to {self.connection_config.db_name} "
201
270
  f"db in collection {self.upload_config.collection_name}"
202
271
  )
272
+
273
+ prepared_data = self._prepare_data_for_insert(data=data)
274
+
203
275
  with self.get_client() as client:
204
276
  try:
205
- res = client.insert(collection_name=self.upload_config.collection_name, data=data)
277
+ res = client.insert(
278
+ collection_name=self.upload_config.collection_name, data=prepared_data
279
+ )
206
280
  except MilvusException as milvus_exception:
207
281
  raise WriteError(
208
282
  f"failed to upload records to Milvus: {str(milvus_exception.message)}"
@@ -19,11 +19,11 @@ class OriginalSyncedBlock(BlockBase):
19
19
  @classmethod
20
20
  def from_dict(cls, data: dict):
21
21
  """Create OriginalSyncedBlock from dictionary data.
22
-
22
+
23
23
  Original blocks contain children content.
24
24
  """
25
25
  if "children" not in data:
26
- raise ValueError(f"OriginalSyncedBlock data missing 'children': {data}")
26
+ raise ValueError(f"OriginalSyncedBlock data missing 'children': {data}")
27
27
  return cls(children=data["children"])
28
28
 
29
29
  def get_html(self) -> Optional[HtmlTag]:
@@ -38,7 +38,7 @@ class DuplicateSyncedBlock(BlockBase):
38
38
  @staticmethod
39
39
  def can_have_children() -> bool:
40
40
  """Check if duplicate synced blocks can have children.
41
-
41
+
42
42
  Duplicate blocks themselves don't have children directly fetched here,
43
43
  but they represent content that does, so Notion API might report has_children=True
44
44
  on the parent block object. The actual children are fetched from the original block.
@@ -48,7 +48,7 @@ class DuplicateSyncedBlock(BlockBase):
48
48
  @classmethod
49
49
  def from_dict(cls, data: dict):
50
50
  """Create DuplicateSyncedBlock from dictionary data.
51
-
51
+
52
52
  Duplicate blocks contain a 'synced_from' reference.
53
53
  """
54
54
  synced_from_data = data.get("synced_from")
@@ -63,7 +63,7 @@ class DuplicateSyncedBlock(BlockBase):
63
63
 
64
64
  def get_html(self) -> Optional[HtmlTag]:
65
65
  """Get HTML representation of the duplicate synced block.
66
-
66
+
67
67
  HTML representation might need fetching the original block's content,
68
68
  which is outside the scope of this simple data class.
69
69
  """
@@ -74,7 +74,7 @@ class SyncBlock(BlockBase):
74
74
  @staticmethod
75
75
  def can_have_children() -> bool:
76
76
  """Check if synced blocks can have children.
77
-
77
+
78
78
  Synced blocks (both original and duplicate) can conceptually have children.
79
79
  """
80
80
  return True
@@ -82,7 +82,7 @@ class SyncBlock(BlockBase):
82
82
  @classmethod
83
83
  def from_dict(cls, data: dict):
84
84
  """Create appropriate SyncedBlock subclass from dictionary data.
85
-
85
+
86
86
  Determine if it's a duplicate (has 'synced_from') or original (has 'children').
87
87
  """
88
88
  if data.get("synced_from") is not None:
@@ -99,10 +99,9 @@ class SyncBlock(BlockBase):
99
99
  # Consider logging a warning here if strictness is needed.
100
100
  return OriginalSyncedBlock(children=[])
101
101
 
102
-
103
102
  def get_html(self) -> Optional[HtmlTag]:
104
103
  """Get HTML representation of the synced block.
105
-
104
+
106
105
  The specific instance returned by from_dict (Original or Duplicate)
107
106
  will handle its own get_html logic.
108
107
  This method on the base SyncBlock might not be directly called.
@@ -240,7 +240,7 @@ class PineconeUploader(VectorDBUploader):
240
240
  destination_name: str = "unstructuredautocreated",
241
241
  destination_type: Literal["pod", "serverless"] = "serverless",
242
242
  serverless_cloud: str = "aws",
243
- serverless_region: str = "us-west-2",
243
+ serverless_region: str = "us-east-1",
244
244
  pod_environment: str = "us-east1-gcp",
245
245
  pod_type: str = "p1.x1",
246
246
  pod_count: int = 1,
@@ -12,6 +12,7 @@ from unstructured_ingest.logger import logger
12
12
  from unstructured_ingest.utils.dep_check import requires_dependencies
13
13
 
14
14
  if TYPE_CHECKING:
15
+ from bs4 import BeautifulSoup
15
16
  from bs4.element import Tag
16
17
  from requests import Session
17
18
 
@@ -96,7 +97,7 @@ class HtmlMixin(BaseModel):
96
97
  from bs4 import BeautifulSoup
97
98
 
98
99
  soup = BeautifulSoup(html, "html.parser")
99
- tags = soup.find_all("a", href=True)
100
+ tags = self._find_hyperlink_tags(soup)
100
101
  hrefs = [
101
102
  tag["href"]
102
103
  for tag in tags
@@ -158,3 +159,15 @@ class HtmlMixin(BaseModel):
158
159
  )
159
160
  for url_to_download in urls_to_download
160
161
  ]
162
+
163
+ @requires_dependencies(["bs4"])
164
+ def _find_hyperlink_tags(self, html_soup: "BeautifulSoup") -> list["Tag"]:
165
+ """Find hyperlink tags in the HTML.
166
+
167
+ Overwrite this method to customize the tag search.
168
+ """
169
+ from bs4.element import Tag
170
+
171
+ return [
172
+ element for element in html_soup.find_all("a", href=True) if isinstance(element, Tag)
173
+ ]
@@ -1 +0,0 @@
1
- __version__ = "1.0.37" # pragma: no cover