classifyre-cli 0.4.11__tar.gz → 0.4.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (191) hide show
  1. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/.turbo/turbo-build.log +1 -1
  2. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/PKG-INFO +1 -1
  3. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/package.json +1 -1
  4. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/pyproject.toml +1 -1
  5. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/scripts/generate_models.py +25 -0
  6. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/broken_links/detector.py +7 -10
  7. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_llm.py +1 -1
  8. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_text_classification.py +1 -1
  9. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/models/generated_detectors.py +2 -2
  10. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/models/generated_input.py +166 -4
  11. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/models/generated_single_asset_scan_results.py +13 -1
  12. classifyre_cli-0.4.13/src/sources/asset_metadata.py +138 -0
  13. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/base.py +21 -0
  14. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/confluence/source.py +30 -0
  15. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/databricks/source.py +63 -8
  16. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/jira/source.py +65 -0
  17. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/mongodb/source.py +18 -0
  18. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/mysql/source.py +19 -0
  19. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/neo4j/source.py +27 -0
  20. classifyre_cli-0.4.13/src/sources/notion/__init__.py +3 -0
  21. classifyre_cli-0.4.13/src/sources/notion/client.py +223 -0
  22. classifyre_cli-0.4.13/src/sources/notion/source.py +987 -0
  23. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/object_storage/base.py +23 -0
  24. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/postgresql/source.py +27 -0
  25. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/powerbi/source.py +27 -0
  26. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/servicedesk/source.py +29 -0
  27. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/slack/source.py +12 -0
  28. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/sqlite/source.py +10 -0
  29. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/tableau/source.py +19 -0
  30. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/tabular_base.py +106 -0
  31. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/wordpress/source.py +25 -0
  32. classifyre_cli-0.4.13/src/utils/file_metadata.py +236 -0
  33. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/broken_links/test_broken_links_detector.py +87 -0
  34. classifyre_cli-0.4.13/tests/test_assets_metadata_catalog.py +73 -0
  35. classifyre_cli-0.4.13/tests/test_notion_source.py +227 -0
  36. classifyre_cli-0.4.13/tests/utils/test_file_metadata.py +79 -0
  37. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/uv.lock +224 -153
  38. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/.gitignore +0 -0
  39. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/.python-version +0 -0
  40. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/README.md +0 -0
  41. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/main.py +0 -0
  42. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/__init__.py +0 -0
  43. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/__init__.py +0 -0
  44. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/base.py +0 -0
  45. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/broken_links/__init__.py +0 -0
  46. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/config.py +0 -0
  47. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/content/__init__.py +0 -0
  48. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/__init__.py +0 -0
  49. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/detector.py +0 -0
  50. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/extractor.py +0 -0
  51. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/runners/__init__.py +0 -0
  52. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_base.py +0 -0
  53. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_factory.py +0 -0
  54. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_feature_extraction.py +0 -0
  55. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_gliner2.py +0 -0
  56. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_image_classification.py +0 -0
  57. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_object_detection.py +0 -0
  58. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_regex.py +0 -0
  59. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/trainer.py +0 -0
  60. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/dependencies.py +0 -0
  61. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/pii/__init__.py +0 -0
  62. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/pii/detector.py +0 -0
  63. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/secrets/__init__.py +0 -0
  64. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/secrets/detector.py +0 -0
  65. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/threat/__init__.py +0 -0
  66. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/threat/code_security_detector.py +0 -0
  67. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/threat/yara_detector.py +0 -0
  68. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/main.py +0 -0
  69. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/outputs/__init__.py +0 -0
  70. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/outputs/base.py +0 -0
  71. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/outputs/console.py +0 -0
  72. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/outputs/factory.py +0 -0
  73. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/outputs/file.py +0 -0
  74. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/outputs/rest.py +0 -0
  75. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/pipeline/__init__.py +0 -0
  76. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/pipeline/content_provider.py +0 -0
  77. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/pipeline/detector_pipeline.py +0 -0
  78. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/pipeline/parsed_content_provider.py +0 -0
  79. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/pipeline/worker_pool.py +0 -0
  80. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sandbox/__init__.py +0 -0
  81. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sandbox/runner.py +0 -0
  82. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/__init__.py +0 -0
  83. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/atlassian_common.py +0 -0
  84. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/azure_blob_storage/__init__.py +0 -0
  85. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/azure_blob_storage/source.py +0 -0
  86. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/confluence/__init__.py +0 -0
  87. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/databricks/__init__.py +0 -0
  88. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/dependencies.py +0 -0
  89. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/google_cloud_storage/__init__.py +0 -0
  90. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/google_cloud_storage/source.py +0 -0
  91. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/hive/__init__.py +0 -0
  92. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/hive/source.py +0 -0
  93. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/jira/__init__.py +0 -0
  94. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/mongodb/__init__.py +0 -0
  95. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/mssql/__init__.py +0 -0
  96. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/mssql/source.py +0 -0
  97. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/mysql/__init__.py +0 -0
  98. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/neo4j/__init__.py +0 -0
  99. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/oracle/__init__.py +0 -0
  100. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/oracle/source.py +0 -0
  101. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/postgresql/__init__.py +0 -0
  102. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/powerbi/__init__.py +0 -0
  103. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/recipe_normalizer.py +0 -0
  104. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/s3_compatible_storage/README.md +0 -0
  105. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/s3_compatible_storage/__init__.py +0 -0
  106. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/s3_compatible_storage/source.py +0 -0
  107. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/servicedesk/__init__.py +0 -0
  108. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/slack/__init__.py +0 -0
  109. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/snowflake/__init__.py +0 -0
  110. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/snowflake/source.py +0 -0
  111. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/sqlite/__init__.py +0 -0
  112. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/tableau/__init__.py +0 -0
  113. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/tabular_utils.py +0 -0
  114. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/wordpress/__init__.py +0 -0
  115. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/telemetry.py +0 -0
  116. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/utils/__init__.py +0 -0
  117. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/utils/content_extraction.py +0 -0
  118. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/utils/embedded_images.py +0 -0
  119. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/utils/file_parser.py +0 -0
  120. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/utils/file_to_images.py +0 -0
  121. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/utils/hashing.py +0 -0
  122. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/utils/uv_sync.py +0 -0
  123. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/utils/validation.py +0 -0
  124. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/__init__.py +0 -0
  125. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/conftest.py +0 -0
  126. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/__init__.py +0 -0
  127. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/conftest.py +0 -0
  128. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/content/__init__.py +0 -0
  129. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/custom/__init__.py +0 -0
  130. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/custom/conftest.py +0 -0
  131. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/custom/test_invoice_extraction.py +0 -0
  132. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/custom/test_llm_runner.py +0 -0
  133. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/custom/test_pipeline_integration.py +0 -0
  134. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/custom/test_regex_runner.py +0 -0
  135. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/custom/test_transformer_runners.py +0 -0
  136. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/pii/__init__.py +0 -0
  137. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/pii/conftest.py +0 -0
  138. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/pii/sample_invoice.pdf +0 -0
  139. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/pii/test_pii_detector.py +0 -0
  140. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
  141. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/secrets/__init__.py +0 -0
  142. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/secrets/test_secrets_detector.py +0 -0
  143. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
  144. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/test_base_detector.py +0 -0
  145. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
  146. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/test_detector_catalog_commercial.py +0 -0
  147. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/test_detector_pipeline_types.py +0 -0
  148. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/test_detector_schema_examples.py +0 -0
  149. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/test_detector_types.py +0 -0
  150. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/test_phase2_detectors.py +0 -0
  151. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/test_registry.py +0 -0
  152. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/threat/__init__.py +0 -0
  153. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/threat/test_code_security_detector.py +0 -0
  154. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/threat/test_yara_detector.py +0 -0
  155. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
  156. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/integration/test_wordpress_links_assets.py +0 -0
  157. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/pipeline/test_detector_pipeline.py +0 -0
  158. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/pipeline/test_worker_pool.py +0 -0
  159. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_azure_blob_storage_source.py +0 -0
  160. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_base_source_attachment.py +0 -0
  161. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_base_source_sampling.py +0 -0
  162. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_confluence_source.py +0 -0
  163. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_custom_extractor.py +0 -0
  164. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_databricks_source.py +0 -0
  165. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_google_cloud_storage_source.py +0 -0
  166. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_hashing.py +0 -0
  167. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_hive_source.py +0 -0
  168. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_jira_source.py +0 -0
  169. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_mongodb_source.py +0 -0
  170. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_mssql_source.py +0 -0
  171. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_mysql_source.py +0 -0
  172. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_neo4j_source.py +0 -0
  173. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_oracle_source.py +0 -0
  174. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_outputs.py +0 -0
  175. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_postgresql_source.py +0 -0
  176. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_powerbi_source.py +0 -0
  177. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_recipe_normalizer.py +0 -0
  178. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_s3_compatible_storage_source.py +0 -0
  179. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_sandbox_runner.py +0 -0
  180. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_servicedesk_source.py +0 -0
  181. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_slack_source.py +0 -0
  182. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_snowflake_source.py +0 -0
  183. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_source_dependency_groups.py +0 -0
  184. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_sqlite_source.py +0 -0
  185. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_tableau_source.py +0 -0
  186. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_tabular_utils.py +0 -0
  187. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_wordpress_source.py +0 -0
  188. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/utils/test_content_extraction.py +0 -0
  189. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/utils/test_embedded_images.py +0 -0
  190. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/utils/test_file_parser.py +0 -0
  191. {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/utils/test_file_to_images.py +0 -0
@@ -1,3 +1,3 @@
1
1
  $ uv sync
2
- Resolved 265 packages in 157ms
2
+ Resolved 268 packages in 156ms
3
3
  Checked 50 packages in 1ms
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: classifyre-cli
3
- Version: 0.4.11
3
+ Version: 0.4.13
4
4
  Summary: Classifyre CLI — scan and classify unstructured data sources
5
5
  License: MIT
6
6
  Keywords: data,ingestion,metadata,pii,secrets,unstructured
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@classifyre/cli",
3
- "version": "0.4.11",
3
+ "version": "0.4.13",
4
4
  "private": true,
5
5
  "scripts": {
6
6
  "build": "uv sync",
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "classifyre-cli"
3
- version = "0.4.11"
3
+ version = "0.4.13"
4
4
  description = "Classifyre CLI — scan and classify unstructured data sources"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -16,6 +16,14 @@ _PIPELINE_TYPE_DEFAULTS: dict[str, str] = {
16
16
  "LLMPipelineSchema": "LLM",
17
17
  }
18
18
 
19
+ # Pipeline schema classes whose `severity` field has a string default from JSON schema
20
+ # ('info') but must be an enum instance to avoid Pydantic serialization warnings
21
+ # ("Expected `enum` - serialized value may not be as expected").
22
+ _SEVERITY_ENUM_DEFAULT_CLASSES = {
23
+ "LLMPipelineSchema",
24
+ "TextClassificationPipelineSchema",
25
+ }
26
+
19
27
 
20
28
  def _patch_pipeline_type_defaults(source: str) -> str:
21
29
  """Add `= 'X'` default to discriminator `type` fields on pipeline schemas."""
@@ -28,6 +36,22 @@ def _patch_pipeline_type_defaults(source: str) -> str:
28
36
  return source
29
37
 
30
38
 
39
+ def _patch_severity_enum_defaults(source: str) -> str:
40
+ """Replace string 'info' severity Field defaults with Severity.info enum instances.
41
+
42
+ datamodel-codegen emits Field('info', ...) from the JSON schema default, but
43
+ Pydantic v2 warns at serialization time when the stored value is a plain string
44
+ rather than a Severity enum member. This patch rewrites only the severity field
45
+ inside each affected class so the fix survives future codegen runs.
46
+ """
47
+ for cls_name in _SEVERITY_ENUM_DEFAULT_CLASSES:
48
+ # Match from the class definition up through the severity Field default string.
49
+ pattern = rf"(class {re.escape(cls_name)}\(.*?severity: Severity \| None = Field\(\n\s+)'info'(\s*,)"
50
+ replacement = rf"\1Severity.info\2"
51
+ source = re.sub(pattern, replacement, source, flags=re.DOTALL)
52
+ return source
53
+
54
+
31
55
  def run_codegen(input_file):
32
56
  """Generate Pydantic models from a single JSON schema file."""
33
57
  cmd = [
@@ -74,6 +98,7 @@ def main():
74
98
  detector_schema = SCHEMA_DIR / "all_detectors.json"
75
99
  content = run_codegen(detector_schema)
76
100
  content = _patch_pipeline_type_defaults(content)
101
+ content = _patch_severity_enum_defaults(content)
77
102
  (MODEL_DIR / "generated_detectors.py").write_text(content)
78
103
  print("Wrote src/models/generated_detectors.py")
79
104
 
@@ -156,6 +156,10 @@ class BrokenLinksDetector(BaseDetector):
156
156
  if status_code in {405, 501}:
157
157
  return self._scan_with_get(url, line, start, end, "head_not_supported")
158
158
 
159
+ # Some servers block HEAD (403) but serve content via GET.
160
+ if status_code == 403:
161
+ return self._scan_with_get(url, line, start, end, "head_forbidden")
162
+
159
163
  if status_code >= 400:
160
164
  return LinkScanResult(
161
165
  url=url,
@@ -169,17 +173,10 @@ class BrokenLinksDetector(BaseDetector):
169
173
 
170
174
  content_length = self._parse_content_length(head_response.headers)
171
175
  if content_length == 0:
172
- return LinkScanResult(
173
- url=url,
174
- line=line,
175
- start=start,
176
- end=end,
177
- finding_type="empty_content",
178
- confidence=0.9,
179
- metadata={"status_code": status_code, "reason": "empty_head_content_length"},
180
- )
176
+ # Content-Length: 0 on HEAD can be misleading (e.g., YouTube, TikTok).
177
+ # Fall back to GET to verify the page truly has no content.
178
+ return self._scan_with_get(url, line, start, end, "empty_head_content_length")
181
179
 
182
- # Some servers omit Content-Length, so perform a lightweight GET check.
183
180
  if content_length is None:
184
181
  return self._scan_with_get(url, line, start, end, "missing_content_length")
185
182
 
@@ -241,7 +241,7 @@ class LLMRunner(BaseRunner):
241
241
  ) -> list[DetectionResult]:
242
242
  schema = self._schema
243
243
  threshold = schema.confidence_threshold if schema.confidence_threshold is not None else 0.5
244
- default_severity = schema.severity or Severity.info
244
+ default_severity = schema.severity if schema.severity is not None else Severity.info
245
245
  extracted = self._coerce_fields(payload.get("fields"))
246
246
 
247
247
  raw_labels = payload.get("labels")
@@ -66,7 +66,7 @@ class TextClassificationRunner(BaseRunner):
66
66
  chunk_overlap: int = getattr(schema.chunk_overlap, "root", schema.chunk_overlap) or 0
67
67
  max_length: int | None = getattr(schema.max_length, "root", schema.max_length)
68
68
  threshold = schema.confidence_threshold if schema.confidence_threshold is not None else 0.7
69
- default_severity = schema.severity or Severity.info
69
+ default_severity = schema.severity if schema.severity is not None else Severity.info
70
70
 
71
71
  best_scores: dict[str, float] = {}
72
72
  try:
@@ -1080,7 +1080,7 @@ class LLMPipelineSchema(BaseModel):
1080
1080
  False, description='Allow more than one label per asset.'
1081
1081
  )
1082
1082
  severity: Severity | None = Field(
1083
- 'info',
1083
+ Severity.info,
1084
1084
  description='Default severity when no severity_map rule matches a predicted label.',
1085
1085
  )
1086
1086
  severity_map: list[PipelineSeverityRule] | None = Field(
@@ -1177,7 +1177,7 @@ class TextClassificationPipelineSchema(BaseModel):
1177
1177
  le=1.0,
1178
1178
  )
1179
1179
  severity: Severity | None = Field(
1180
- 'info', description='Default severity when no severity_map rule matches.'
1180
+ Severity.info, description='Default severity when no severity_map rule matches.'
1181
1181
  )
1182
1182
  severity_map: list[PipelineSeverityRule] | None = Field(
1183
1183
  None,
@@ -42,6 +42,7 @@ class AssetType(StrEnum):
42
42
  JIRA = 'JIRA'
43
43
  SERVICEDESK = 'SERVICEDESK'
44
44
  SQLITE = 'SQLITE'
45
+ NOTION = 'NOTION'
45
46
 
46
47
 
47
48
  class SourceCategory(StrEnum):
@@ -1078,9 +1079,10 @@ class DatabricksAuthMode(StrEnum):
1078
1079
 
1079
1080
  PAT_TOKEN = 'PAT_TOKEN'
1080
1081
  SERVICE_PRINCIPAL = 'SERVICE_PRINCIPAL'
1082
+ AZURE_SERVICE_PRINCIPAL = 'AZURE_SERVICE_PRINCIPAL'
1081
1083
 
1082
1084
 
1083
- class DatabricksRequiredPat(BaseModel):
1085
+ class PersonalAccessToken(BaseModel):
1084
1086
  model_config = ConfigDict(
1085
1087
  extra='forbid',
1086
1088
  )
@@ -1094,7 +1096,7 @@ class DatabricksRequiredPat(BaseModel):
1094
1096
  )
1095
1097
 
1096
1098
 
1097
- class DatabricksRequiredServicePrincipal(BaseModel):
1099
+ class ServicePrincipalOAuthM2M(BaseModel):
1098
1100
  model_config = ConfigDict(
1099
1101
  extra='forbid',
1100
1102
  )
@@ -1109,6 +1111,24 @@ class DatabricksRequiredServicePrincipal(BaseModel):
1109
1111
  client_id: str = Field(..., description='Databricks service principal client ID')
1110
1112
 
1111
1113
 
1114
+ class AzureServicePrincipal(BaseModel):
1115
+ model_config = ConfigDict(
1116
+ extra='forbid',
1117
+ )
1118
+ auth_mode: Literal['AZURE_SERVICE_PRINCIPAL']
1119
+ workspace_url: AnyUrl = Field(
1120
+ ...,
1121
+ description='Azure Databricks workspace URL (for example, https://adb-1234567890123456.7.azuredatabricks.net)',
1122
+ )
1123
+ warehouse_id: str = Field(
1124
+ ..., description='Databricks SQL warehouse ID used for sampling queries'
1125
+ )
1126
+ client_id: str = Field(
1127
+ ..., description='Azure AD application (client) ID for the service principal'
1128
+ )
1129
+ tenant_id: str = Field(..., description='Azure AD tenant ID')
1130
+
1131
+
1112
1132
  class DatabricksMaskedPat(BaseModel):
1113
1133
  model_config = ConfigDict(
1114
1134
  extra='forbid',
@@ -1125,6 +1145,15 @@ class DatabricksMaskedServicePrincipal(BaseModel):
1125
1145
  )
1126
1146
 
1127
1147
 
1148
+ class DatabricksMaskedAzureServicePrincipal(BaseModel):
1149
+ model_config = ConfigDict(
1150
+ extra='forbid',
1151
+ )
1152
+ client_secret: str = Field(
1153
+ ..., description='Azure AD client secret for the service principal'
1154
+ )
1155
+
1156
+
1128
1157
  class DatabricksOptionalConnection(BaseModel):
1129
1158
  """
1130
1159
  Databricks API and SQL statement execution tuning options.
@@ -1842,6 +1871,7 @@ class Type(StrEnum):
1842
1871
  JIRA = 'JIRA'
1843
1872
  SERVICEDESK = 'SERVICEDESK'
1844
1873
  SQLITE = 'SQLITE'
1874
+ NOTION = 'NOTION'
1845
1875
 
1846
1876
 
1847
1877
  class SlackInput(CoreInput):
@@ -2020,8 +2050,8 @@ class DatabricksInput(CoreInput):
2020
2050
  type: Literal['DATABRICKS'] = Field(
2021
2051
  'DATABRICKS', description='Type of the asset or source'
2022
2052
  )
2023
- required: DatabricksRequiredPat | DatabricksRequiredServicePrincipal = Field(
2024
- ..., title='DatabricksRequired'
2053
+ required: PersonalAccessToken | ServicePrincipalOAuthM2M | AzureServicePrincipal = (
2054
+ Field(..., title='DatabricksRequired')
2025
2055
  )
2026
2056
  masked: DatabricksMaskedPat | DatabricksMaskedServicePrincipal = Field(
2027
2057
  ..., title='DatabricksMasked'
@@ -2629,6 +2659,7 @@ class Type17(StrEnum):
2629
2659
  JIRA = 'JIRA'
2630
2660
  SERVICEDESK = 'SERVICEDESK'
2631
2661
  SQLITE = 'SQLITE'
2662
+ NOTION = 'NOTION'
2632
2663
 
2633
2664
 
2634
2665
  class ConfluenceInput(CoreInput):
@@ -2737,6 +2768,135 @@ class SQLiteInput(CoreInput):
2737
2768
  resources: ResourceOverrides | None = None
2738
2769
 
2739
2770
 
2771
+ class NotionRequired(BaseModel):
2772
+ """
2773
+ Notion has no required connection fields; the integration token lives in the masked section.
2774
+ """
2775
+
2776
+ model_config = ConfigDict(
2777
+ extra='forbid',
2778
+ )
2779
+
2780
+
2781
+ class NotionMasked(BaseModel):
2782
+ model_config = ConfigDict(
2783
+ extra='forbid',
2784
+ )
2785
+ notion_token: str = Field(
2786
+ ...,
2787
+ description='Notion API token used as a Bearer credential. Accepts an internal integration secret (ntn_...) or an OAuth public-integration access token.',
2788
+ )
2789
+
2790
+
2791
+ class NotionOptionalConnection(BaseModel):
2792
+ """
2793
+ HTTP, version, and retry settings for Notion API calls.
2794
+ """
2795
+
2796
+ model_config = ConfigDict(
2797
+ extra='forbid',
2798
+ )
2799
+ notion_version: str | None = Field(
2800
+ '2025-09-03',
2801
+ description='Notion-Version header sent with every request. Defaults to the data-sources API version.',
2802
+ )
2803
+ request_timeout_seconds: float | None = Field(
2804
+ 30, description='HTTP request timeout for Notion API calls', ge=1.0
2805
+ )
2806
+ rate_limit_delay_seconds: float | None = Field(
2807
+ 0,
2808
+ description='Additional delay between API requests to reduce rate-limit pressure',
2809
+ ge=0.0,
2810
+ )
2811
+ max_retries: int | None = Field(
2812
+ 3,
2813
+ description='Maximum retry attempts for transient API failures and rate limits',
2814
+ ge=0,
2815
+ le=10,
2816
+ )
2817
+
2818
+
2819
+ class NotionOptionalScope(BaseModel):
2820
+ """
2821
+ Optional Notion scope filters. When omitted, all content shared with the integration is eligible for sampling.
2822
+ """
2823
+
2824
+ model_config = ConfigDict(
2825
+ extra='forbid',
2826
+ )
2827
+ page_ids: list[str] | None = Field(
2828
+ None,
2829
+ description='Restrict extraction to specific page IDs (up to 250)',
2830
+ max_length=250,
2831
+ )
2832
+ data_source_ids: list[str] | None = Field(
2833
+ None,
2834
+ description='Restrict extraction to specific data source IDs (up to 250)',
2835
+ max_length=250,
2836
+ )
2837
+ search_query: str | None = Field(
2838
+ None,
2839
+ description='Optional full-text query passed to the Notion search endpoint to narrow discovery',
2840
+ min_length=1,
2841
+ )
2842
+
2843
+
2844
+ class NotionOptionalContent(BaseModel):
2845
+ """
2846
+ Notion content extraction controls.
2847
+ """
2848
+
2849
+ model_config = ConfigDict(
2850
+ extra='forbid',
2851
+ )
2852
+ include_comments: bool | None = Field(
2853
+ True,
2854
+ description='Include page and block comments and aggregate them into a per-page comments asset',
2855
+ )
2856
+ include_files: bool | None = Field(
2857
+ True,
2858
+ description='Materialize files from file/image/pdf/video blocks, file properties, and page icon/cover as related assets',
2859
+ )
2860
+ include_linked_pages: bool | None = Field(
2861
+ True,
2862
+ description='Wire parent, relation, and mention references between pages into the asset links graph',
2863
+ )
2864
+ include_data_sources: bool | None = Field(
2865
+ True,
2866
+ description='Emit Notion data sources (databases) as assets with their schema and link their row pages',
2867
+ )
2868
+ file_max_bytes: int | None = Field(
2869
+ 5242880,
2870
+ description='Maximum bytes downloaded per file for MIME inference and text extraction',
2871
+ ge=1024,
2872
+ )
2873
+
2874
+
2875
+ class NotionOptional(BaseModel):
2876
+ model_config = ConfigDict(
2877
+ extra='forbid',
2878
+ )
2879
+ connection: NotionOptionalConnection | None = None
2880
+ scope: NotionOptionalScope | None = None
2881
+ content: NotionOptionalContent | None = None
2882
+
2883
+
2884
+ class NotionInput(CoreInput):
2885
+ type: Literal['NOTION'] = Field('NOTION', description='Type of the asset or source')
2886
+ required: NotionRequired
2887
+ masked: NotionMasked
2888
+ optional: NotionOptional | None = None
2889
+ detectors: list[Detector] | None = Field(
2890
+ None, description='Detectors to run on ingested content'
2891
+ )
2892
+ custom_detectors: list[CustomDetectorSelection] | None = Field(
2893
+ None,
2894
+ description='Reusable custom detector IDs selected from the custom detector catalog.',
2895
+ )
2896
+ sampling: SamplingConfig
2897
+ resources: ResourceOverrides | None = None
2898
+
2899
+
2740
2900
  class SourceInput(
2741
2901
  RootModel[
2742
2902
  SlackInput
@@ -2759,6 +2919,7 @@ class SourceInput(
2759
2919
  | JiraInput
2760
2920
  | ServiceDeskInput
2761
2921
  | SQLiteInput
2922
+ | NotionInput
2762
2923
  ]
2763
2924
  ):
2764
2925
  root: (
@@ -2782,6 +2943,7 @@ class SourceInput(
2782
2943
  | JiraInput
2783
2944
  | ServiceDeskInput
2784
2945
  | SQLiteInput
2946
+ | NotionInput
2785
2947
  ) = Field(
2786
2948
  ...,
2787
2949
  description='Merged configuration schema with all source types and common definitions',
@@ -210,7 +210,14 @@ class SingleAssetScanResults(BaseModel):
210
210
  ..., description='Linked asset hashes referenced by this asset', title='Links'
211
211
  )
212
212
  asset_type: AssetType = Field(
213
- ..., description='Canonical asset content type', title='Asset Type'
213
+ ...,
214
+ description='Canonical asset content type (used for detector routing)',
215
+ title='Asset Type',
216
+ )
217
+ asset_kind: str | None = Field(
218
+ None,
219
+ description='Catalog asset kind discriminator (file, image, page, comment, table, ...). Persisted as the asset type for display/filtering.',
220
+ title='Asset Kind',
214
221
  )
215
222
  source_id: str | None = Field(
216
223
  None,
@@ -238,3 +245,8 @@ class SingleAssetScanResults(BaseModel):
238
245
  description='Statistics about the detector scan for this asset',
239
246
  title='Scan Stats',
240
247
  )
248
+ metadata: dict[str, Any] | None = Field(
249
+ None,
250
+ description='Source-specific asset metadata using normalized keys (size_bytes, row_count, etc.) where applicable',
251
+ title='Metadata',
252
+ )
@@ -0,0 +1,138 @@
1
+ """Asset-metadata contract: the single source of truth for what each source
2
+ extracts is the ``x-asset-metadata`` catalog embedded in
3
+ ``packages/schemas/src/schemas/all_input_sources.json``.
4
+
5
+ This module loads/resolves that catalog and validates metadata dicts against it.
6
+ Validation is strict (raises) under pytest or when ``CLASSIFYRE_STRICT_METADATA``
7
+ is set, and otherwise logs a warning during real ingestion — so drift between a
8
+ source's emitted keys and the declared catalog is caught either in CI or at runtime.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import logging
14
+ import os
15
+ from functools import cache
16
+ from typing import Any
17
+
18
+ from ..utils.validation import _load_schema
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ _CATALOG_KEY = "x-asset-metadata"
23
+
24
+ ResolvedField = dict[str, Any] # {name, type, description, required}
25
+
26
+
27
class AssetMetadataContractError(AssertionError):
    """Signal a violation of the asset-metadata catalog contract.

    Raised when the catalog or a catalog entry is missing, and (in strict
    mode) when emitted metadata does not match the declared fields.
    """
29
+
30
+
31
def _strict_mode() -> bool:
    """Report whether contract violations should raise instead of warn.

    Strict mode is on when ``PYTEST_CURRENT_TEST`` or
    ``CLASSIFYRE_STRICT_METADATA`` holds a non-empty value in the environment.
    """
    environment = os.environ
    return any(
        environment.get(flag)
        for flag in ("PYTEST_CURRENT_TEST", "CLASSIFYRE_STRICT_METADATA")
    )
35
+
36
+
37
@cache
def load_catalog() -> dict[str, Any]:
    """Return the ``x-asset-metadata`` catalog read from the merged schema.

    The result is memoised with ``functools.cache``, so the schema is only
    loaded on the first call.

    Raises:
        AssetMetadataContractError: the catalog key is absent from
            ``all_input_sources.json`` or its value is not a dict.
    """
    merged_schema = _load_schema("all_input_sources.json")
    section = merged_schema.get(_CATALOG_KEY)
    if isinstance(section, dict):
        return section
    raise AssetMetadataContractError(
        f"Missing '{_CATALOG_KEY}' catalog in all_input_sources.json"
    )
47
+
48
+
49
def _source_key(source_type: str) -> str:
    """Map a source identifier onto its catalog key."""
    # The catalog is keyed like the AssetType enum: the upper-cased identifier.
    catalog_key = source_type.upper()
    return catalog_key
52
+
53
+
54
def describe_type(prop_schema: dict[str, Any]) -> str:
    """Render a JSON-Schema property type as a short display string.

    Args:
        prop_schema: A JSON-Schema property definition. Only its ``type`` and,
            for arrays, ``items`` keys are consulted.

    Returns:
        ``"<item>[]"`` for arrays, the type name for scalars, and ``"string"``
        when no type is declared. A union type (``type`` given as a list, which
        JSON Schema permits) is rendered as ``"a | b"`` rather than as the
        Python list repr; a union item type is parenthesised, e.g.
        ``"(integer | null)[]"``.
    """

    def _type_name(raw: Any) -> str:
        # JSON Schema lets "type" be a list of alternatives.
        if isinstance(raw, (list, tuple)):
            return " | ".join(str(part) for part in raw) or "string"
        return str(raw) if raw else "string"

    json_type = prop_schema.get("type")
    if json_type == "array":
        items = prop_schema.get("items", {})
        # "items" may legally be a boolean schema; fall back to "string" then.
        item_type = _type_name(items.get("type")) if isinstance(items, dict) else "string"
        if " | " in item_type:
            item_type = f"({item_type})"
        return f"{item_type}[]"
    return _type_name(json_type)
62
+
63
+
64
def resolve_fields(source_type: str, asset_kind: str) -> list[ResolvedField]:
    """Return the fields the catalog declares for one (source, asset kind).

    A catalog entry lists reusable ``contentTypes`` under ``use`` and may add
    its own ``properties``. Properties are merged in that order, so the entry's
    own definitions override those of the content types it uses. A field is
    required when any used content type, or the entry itself, lists it under
    ``required``.

    Raises:
        AssetMetadataContractError: the catalog has no entry for the given
            source and asset kind.
    """
    catalog = load_catalog()
    source_entry = catalog.get("sources", {}).get(_source_key(source_type))
    if not (isinstance(source_entry, dict) and asset_kind in source_entry):
        raise AssetMetadataContractError(
            f"No catalog entry for source '{source_type}' asset kind '{asset_kind}'"
        )
    entry = source_entry[asset_kind]
    content_types = catalog.get("contentTypes", {})

    # Used content types come first, the entry itself last, so it wins on clashes.
    layers = [content_types.get(used_name, {}) for used_name in entry.get("use", [])]
    layers.append(entry)

    merged_properties: dict[str, dict[str, Any]] = {}
    required_names: set[str] = set()
    for layer in layers:
        merged_properties.update(layer.get("properties", {}))
        required_names.update(layer.get("required", []))

    resolved: list[ResolvedField] = []
    for field_name, field_schema in merged_properties.items():
        resolved.append(
            {
                "name": field_name,
                "type": describe_type(field_schema),
                "description": field_schema.get("description", ""),
                "required": field_name in required_names,
            }
        )
    return resolved
101
+
102
+
103
def validate_metadata(
    source_type: str,
    asset_kind: str,
    data: dict[str, Any],
) -> dict[str, Any]:
    """Check an emitted metadata dict against the catalog and hand it back.

    Two rules are enforced: every key in ``data`` must be declared for this
    source/kind, and every required field must be present with a non-null
    value. A missing catalog entry is treated as a violation as well.

    In strict mode a violation raises ``AssetMetadataContractError``; otherwise
    it is logged as a warning. ``data`` is returned unchanged either way.
    """
    try:
        fields = resolve_fields(source_type, asset_kind)
    except AssetMetadataContractError as exc:
        if _strict_mode():
            raise
        logger.warning("Asset metadata contract: %s", exc)
        return data

    declared_names: set[str] = set()
    required_names: set[str] = set()
    for spec in fields:
        declared_names.add(spec["name"])
        if spec["required"]:
            required_names.add(spec["name"])

    undeclared = sorted(key for key in data if key not in declared_names)
    # A required key that is absent or mapped to None counts as missing.
    missing_required = sorted(
        name for name in required_names if data.get(name) is None
    )

    if not (undeclared or missing_required):
        return data

    message = (
        f"[{source_type}/{asset_kind}] "
        f"undeclared={undeclared} missing_required={missing_required}"
    )
    if _strict_mode():
        raise AssetMetadataContractError(message)
    logger.warning("Asset metadata contract drift: %s", message)
    return data
@@ -17,6 +17,11 @@ class BaseSource(ABC):
17
17
  Abstract base class for all metadata extraction sources.
18
18
  """
19
19
 
20
+ # Stable source identifier, overridden by each concrete source (e.g.
21
+ # "postgresql", "wordpress"). Uppercased it maps to the AssetType enum and
22
+ # the x-asset-metadata catalog key.
23
+ source_type: str = ""
24
+
20
25
  # Default batch size for streaming asset results
21
26
  BATCH_SIZE: int = 50
22
27
  HAS_SUCCESSFUL_RUN_ENV = "CLASSIFYRE_SOURCE_HAS_SUCCESSFUL_RUN"
@@ -130,6 +135,22 @@ class BaseSource(ABC):
130
135
  """
131
136
  return calculate_checksum(data)
132
137
 
138
def metadata_fields(self, asset_kind: str, data: dict[str, Any]) -> dict[str, Any]:
    """Return the ``asset_kind`` and ``metadata`` kwargs for SingleAssetScanResults.

    Intended to be spread into the constructor, e.g.
    ``**self.metadata_fields("page", {...})``. ``asset_kind`` is passed through
    as the catalog discriminator. ``data`` is checked by ``validate_metadata``
    against the ``x-asset-metadata`` entry for this source and kind: that raises
    under pytest / ``CLASSIFYRE_STRICT_METADATA`` and only warns otherwise.
    """
    from .asset_metadata import validate_metadata

    checked = validate_metadata(self.source_type, asset_kind, data)
    return {"asset_kind": asset_kind, "metadata": checked}
153
+
133
154
  @abstractmethod
134
155
  def abort(self) -> None:
135
156
  """
@@ -313,6 +313,20 @@ class ConfluenceSource(BaseSource):
313
313
  "status": page.get("status"),
314
314
  "links_count": len(related_hashes),
315
315
  }
316
+ asset_metadata: dict[str, Any] = {
317
+ "page_id": page_id,
318
+ "title": title,
319
+ "links_count": len(related_hashes),
320
+ }
321
+ space_id = page.get("spaceId")
322
+ if space_id is not None:
323
+ asset_metadata["space_key"] = str(space_id)
324
+ status = page.get("status")
325
+ if isinstance(status, str) and status:
326
+ asset_metadata["status"] = status
327
+ author_id = page.get("authorId")
328
+ if isinstance(author_id, str) and author_id:
329
+ asset_metadata["author"] = author_id
316
330
  page_asset = SingleAssetScanResults(
317
331
  hash=page_hash,
318
332
  checksum=self.calculate_checksum(page_metadata),
@@ -330,6 +344,7 @@ class ConfluenceSource(BaseSource):
330
344
  )
331
345
  ),
332
346
  runner_id=self.runner_id,
347
+ **self.metadata_fields("page", asset_metadata),
333
348
  )
334
349
 
335
350
  return [page_asset, *related_assets]
@@ -368,6 +383,15 @@ class ConfluenceSource(BaseSource):
368
383
  if download_url:
369
384
  self._attachment_download_url_by_hash[attachment_hash] = download_url
370
385
 
386
+ attachment_metadata: dict[str, Any] = {
387
+ "title": attachment_name,
388
+ "page_hash": page_hash,
389
+ }
390
+ if mime:
391
+ attachment_metadata["mime_type"] = mime
392
+ file_size = attachment.get("fileSize")
393
+ if isinstance(file_size, int):
394
+ attachment_metadata["size_bytes"] = file_size
371
395
  assets.append(
372
396
  SingleAssetScanResults(
373
397
  hash=attachment_hash,
@@ -380,6 +404,7 @@ class ConfluenceSource(BaseSource):
380
404
  created_at=now,
381
405
  updated_at=now,
382
406
  runner_id=self.runner_id,
407
+ **self.metadata_fields("attachment", attachment_metadata),
383
408
  )
384
409
  )
385
410
  hashes.append(attachment_hash)
@@ -453,6 +478,10 @@ class ConfluenceSource(BaseSource):
453
478
  created_at=now,
454
479
  updated_at=now,
455
480
  runner_id=self.runner_id,
481
+ **self.metadata_fields(
482
+ "comments",
483
+ {"page_id": page_id, "comments_count": len(comment_items)},
484
+ ),
456
485
  )
457
486
  return comments_asset, [comments_hash]
458
487
 
@@ -578,6 +607,7 @@ class ConfluenceSource(BaseSource):
578
607
  created_at=now,
579
608
  updated_at=now,
580
609
  runner_id=self.runner_id,
610
+ **self.metadata_fields("linked_file", {"referenced_by": page_hash}),
581
611
  )
582
612
 
583
613
  def _display_name_from_url(self, url: str) -> str: