classifyre-cli 0.4.29__tar.gz → 0.4.31__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (209) hide show
  1. classifyre_cli-0.4.31/.turbo/turbo-build.log +3 -0
  2. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/PKG-INFO +1 -1
  3. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/package.json +1 -1
  4. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/pyproject.toml +1 -1
  5. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/main.py +6 -0
  6. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/models/generated_input.py +4 -3
  7. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/outputs/rest.py +12 -1
  8. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/base.py +120 -1
  9. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/confluence/source.py +14 -8
  10. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/databricks/source.py +5 -0
  11. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/email/source.py +62 -0
  12. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/jira/source.py +18 -12
  13. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/mongodb/source.py +8 -0
  14. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/neo4j/source.py +8 -0
  15. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/notion/source.py +12 -6
  16. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/object_storage/base.py +5 -0
  17. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/powerbi/source.py +17 -0
  18. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/recipe_normalizer.py +2 -2
  19. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/servicedesk/source.py +5 -0
  20. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/slack/source.py +59 -30
  21. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/snowflake/source.py +9 -1
  22. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/tableau/source.py +17 -0
  23. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/tabular_base.py +124 -1
  24. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/wordpress/source.py +70 -1
  25. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/youtube/source.py +3 -0
  26. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_outputs.py +27 -0
  27. classifyre_cli-0.4.31/tests/test_sampling_automatic.py +143 -0
  28. classifyre_cli-0.4.31/tests/test_tabular_automatic_sampling.py +188 -0
  29. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/uv.lock +226 -192
  30. classifyre_cli-0.4.29/.turbo/turbo-build.log +0 -3
  31. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/.gitignore +0 -0
  32. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/.python-version +0 -0
  33. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/README.md +0 -0
  34. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/main.py +0 -0
  35. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/scripts/generate_models.py +0 -0
  36. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/__init__.py +0 -0
  37. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/config.py +0 -0
  38. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/__init__.py +0 -0
  39. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/base.py +0 -0
  40. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/broken_links/__init__.py +0 -0
  41. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/broken_links/detector.py +0 -0
  42. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/config.py +0 -0
  43. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/content/__init__.py +0 -0
  44. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/custom/__init__.py +0 -0
  45. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/custom/detector.py +0 -0
  46. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/custom/extractor.py +0 -0
  47. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/custom/runners/__init__.py +0 -0
  48. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/custom/runners/_base.py +0 -0
  49. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/custom/runners/_factory.py +0 -0
  50. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/custom/runners/_feature_extraction.py +0 -0
  51. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/custom/runners/_gliner2.py +0 -0
  52. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/custom/runners/_image_classification.py +0 -0
  53. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/custom/runners/_llm.py +0 -0
  54. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/custom/runners/_object_detection.py +0 -0
  55. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/custom/runners/_regex.py +0 -0
  56. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/custom/runners/_text_classification.py +0 -0
  57. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/custom/trainer.py +0 -0
  58. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/dependencies.py +0 -0
  59. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/pii/__init__.py +0 -0
  60. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/pii/detector.py +0 -0
  61. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/secrets/__init__.py +0 -0
  62. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/secrets/detector.py +0 -0
  63. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/threat/__init__.py +0 -0
  64. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/threat/code_security_detector.py +0 -0
  65. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/detectors/threat/yara_detector.py +0 -0
  66. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/models/generated_detectors.py +0 -0
  67. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/models/generated_single_asset_scan_results.py +0 -0
  68. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/outputs/__init__.py +0 -0
  69. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/outputs/base.py +0 -0
  70. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/outputs/console.py +0 -0
  71. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/outputs/factory.py +0 -0
  72. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/outputs/file.py +0 -0
  73. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/pipeline/__init__.py +0 -0
  74. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/pipeline/content_provider.py +0 -0
  75. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/pipeline/detector_pipeline.py +0 -0
  76. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/pipeline/parsed_content_provider.py +0 -0
  77. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/pipeline/worker_pool.py +0 -0
  78. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sandbox/__init__.py +0 -0
  79. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sandbox/runner.py +0 -0
  80. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/__init__.py +0 -0
  81. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/asset_metadata.py +0 -0
  82. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/atlassian_common.py +0 -0
  83. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/azure_blob_storage/__init__.py +0 -0
  84. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/azure_blob_storage/source.py +0 -0
  85. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/confluence/__init__.py +0 -0
  86. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/databricks/__init__.py +0 -0
  87. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/dependencies.py +0 -0
  88. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/email/__init__.py +0 -0
  89. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/google_cloud_storage/__init__.py +0 -0
  90. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/google_cloud_storage/source.py +0 -0
  91. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/hive/__init__.py +0 -0
  92. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/hive/source.py +0 -0
  93. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/jira/__init__.py +0 -0
  94. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/mongodb/__init__.py +0 -0
  95. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/mssql/__init__.py +0 -0
  96. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/mssql/source.py +0 -0
  97. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/mysql/__init__.py +0 -0
  98. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/mysql/source.py +0 -0
  99. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/neo4j/__init__.py +0 -0
  100. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/notion/__init__.py +0 -0
  101. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/notion/client.py +0 -0
  102. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/oracle/__init__.py +0 -0
  103. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/oracle/source.py +0 -0
  104. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/postgresql/__init__.py +0 -0
  105. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/postgresql/source.py +0 -0
  106. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/powerbi/__init__.py +0 -0
  107. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/s3_compatible_storage/README.md +0 -0
  108. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/s3_compatible_storage/__init__.py +0 -0
  109. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/s3_compatible_storage/source.py +0 -0
  110. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/servicedesk/__init__.py +0 -0
  111. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/slack/__init__.py +0 -0
  112. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/snowflake/__init__.py +0 -0
  113. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/sqlite/__init__.py +0 -0
  114. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/sqlite/source.py +0 -0
  115. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/tableau/__init__.py +0 -0
  116. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/tabular_utils.py +0 -0
  117. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/wordpress/__init__.py +0 -0
  118. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/youtube/__init__.py +0 -0
  119. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/telemetry.py +0 -0
  120. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/utils/__init__.py +0 -0
  121. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/utils/content_extraction.py +0 -0
  122. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/utils/dependency_groups.py +0 -0
  123. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/utils/embedded_images.py +0 -0
  124. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/utils/file_metadata.py +0 -0
  125. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/utils/file_parser.py +0 -0
  126. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/utils/file_to_images.py +0 -0
  127. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/utils/hashing.py +0 -0
  128. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/utils/resources.py +0 -0
  129. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/utils/transcription.py +0 -0
  130. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/utils/uv_sync.py +0 -0
  131. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/utils/validation.py +0 -0
  132. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/__init__.py +0 -0
  133. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/conftest.py +0 -0
  134. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/__init__.py +0 -0
  135. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
  136. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/conftest.py +0 -0
  137. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/content/__init__.py +0 -0
  138. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/custom/__init__.py +0 -0
  139. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/custom/conftest.py +0 -0
  140. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/custom/test_invoice_extraction.py +0 -0
  141. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/custom/test_llm_runner.py +0 -0
  142. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/custom/test_pipeline_integration.py +0 -0
  143. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/custom/test_regex_runner.py +0 -0
  144. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/custom/test_transformer_runners.py +0 -0
  145. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/pii/__init__.py +0 -0
  146. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/pii/conftest.py +0 -0
  147. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/pii/sample_invoice.pdf +0 -0
  148. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/pii/test_pii_detector.py +0 -0
  149. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
  150. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/secrets/__init__.py +0 -0
  151. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/secrets/test_secrets_detector.py +0 -0
  152. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
  153. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/test_base_detector.py +0 -0
  154. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
  155. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/test_detector_catalog_commercial.py +0 -0
  156. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/test_detector_pipeline_types.py +0 -0
  157. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/test_detector_schema_examples.py +0 -0
  158. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/test_detector_types.py +0 -0
  159. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/test_phase2_detectors.py +0 -0
  160. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/test_registry.py +0 -0
  161. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/threat/__init__.py +0 -0
  162. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/threat/test_code_security_detector.py +0 -0
  163. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/detectors/threat/test_yara_detector.py +0 -0
  164. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
  165. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/integration/test_wordpress_links_assets.py +0 -0
  166. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/pipeline/test_detector_pipeline.py +0 -0
  167. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/pipeline/test_worker_pool.py +0 -0
  168. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_assets_metadata_catalog.py +0 -0
  169. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_azure_blob_storage_source.py +0 -0
  170. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_base_source_attachment.py +0 -0
  171. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_base_source_sampling.py +0 -0
  172. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_config.py +0 -0
  173. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_confluence_source.py +0 -0
  174. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_custom_extractor.py +0 -0
  175. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_databricks_source.py +0 -0
  176. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_dependency_groups.py +0 -0
  177. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_email_source.py +0 -0
  178. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_google_cloud_storage_source.py +0 -0
  179. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_hashing.py +0 -0
  180. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_hive_source.py +0 -0
  181. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_jira_source.py +0 -0
  182. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_mongodb_source.py +0 -0
  183. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_mssql_source.py +0 -0
  184. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_mysql_source.py +0 -0
  185. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_neo4j_source.py +0 -0
  186. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_notion_source.py +0 -0
  187. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_oracle_source.py +0 -0
  188. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_postgresql_source.py +0 -0
  189. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_powerbi_source.py +0 -0
  190. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_recipe_normalizer.py +0 -0
  191. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_s3_compatible_storage_source.py +0 -0
  192. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_sandbox_runner.py +0 -0
  193. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_servicedesk_source.py +0 -0
  194. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_slack_source.py +0 -0
  195. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_snowflake_source.py +0 -0
  196. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_source_dependency_groups.py +0 -0
  197. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_sqlite_source.py +0 -0
  198. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_tableau_source.py +0 -0
  199. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_tabular_utils.py +0 -0
  200. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_uv_sync.py +0 -0
  201. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_wordpress_source.py +0 -0
  202. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_youtube_source.py +0 -0
  203. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/test_youtube_source_integration.py +0 -0
  204. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/utils/test_content_extraction.py +0 -0
  205. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/utils/test_embedded_images.py +0 -0
  206. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/utils/test_file_metadata.py +0 -0
  207. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/utils/test_file_parser.py +0 -0
  208. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/utils/test_file_to_images.py +0 -0
  209. {classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/tests/utils/test_transcription.py +0 -0
@@ -0,0 +1,3 @@
1
+ $ uv sync
2
+ Resolved 265 packages in 164ms
3
+ Checked 50 packages in 1ms
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: classifyre-cli
3
- Version: 0.4.29
3
+ Version: 0.4.31
4
4
  Summary: Classifyre CLI — scan and classify unstructured data sources
5
5
  License: MIT
6
6
  Keywords: data,ingestion,metadata,pii,secrets,unstructured
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@classifyre/cli",
3
- "version": "0.4.29",
3
+ "version": "0.4.31",
4
4
  "private": true,
5
5
  "scripts": {
6
6
  "build": "uv sync",
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "classifyre-cli"
3
- version = "0.4.29"
3
+ version = "0.4.31"
4
4
  description = "Classifyre CLI — scan and classify unstructured data sources"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -366,6 +366,12 @@ async def run_command_async(args: argparse.Namespace, recipe: dict[str, Any]) ->
366
366
  len(all_stubs),
367
367
  )
368
368
 
369
+ # Persist the advanced AUTOMATIC sampling cursor (no-op for
370
+ # other strategies, which return None). Only on the normal
371
+ # completion path — a timed-out run must not advance it.
372
+ if hasattr(sink, "set_sampling_cursor"):
373
+ sink.set_sampling_cursor(source.current_sampling_cursor())
374
+
369
375
  await sink.finish()
370
376
  logger.info(
371
377
  "Extraction completed: %s assets in %s batches",
@@ -85,9 +85,10 @@ class SlackChannelType(StrEnum):
85
85
 
86
86
  class SamplingStrategy(StrEnum):
87
87
  """
88
- Sampling strategy: RANDOM samples items randomly, LATEST prioritises the most recently modified/created items, ALL scans every item with no limit
88
+ Sampling strategy. AUTOMATIC (recommended default) incrementally ingests a new slice of not-yet-seen data on every run, remembering its position between runs and wrapping around to re-scan from the start once everything has been covered — eventually ingesting everything at a bounded cost per run. RANDOM samples items randomly. LATEST prioritises the most recently modified/created items. ALL scans every item with no limit.
89
89
  """
90
90
 
91
+ AUTOMATIC = 'AUTOMATIC'
91
92
  RANDOM = 'RANDOM'
92
93
  LATEST = 'LATEST'
93
94
  ALL = 'ALL'
@@ -95,7 +96,7 @@ class SamplingStrategy(StrEnum):
95
96
 
96
97
  class SamplingConfig(BaseModel):
97
98
  """
98
- Controls how content is extracted from each source. For tabular sources rows_per_page controls both sample size for RANDOM/LATEST and pagination batch size for ALL.
99
+ Controls how content is extracted from each source. For tabular sources rows_per_page controls both sample size for AUTOMATIC/RANDOM/LATEST and pagination batch size for ALL.
99
100
  """
100
101
 
101
102
  model_config = ConfigDict(
@@ -124,7 +125,7 @@ class SamplingConfig(BaseModel):
124
125
  )
125
126
  rows_per_page: int | None = Field(
126
127
  100,
127
- description='Tabular sources only. Number of rows per sample (RANDOM/LATEST) or per pagination batch (ALL). Controls memory usage during large table scans.',
128
+ description='Tabular sources only. Number of rows per sample (AUTOMATIC/RANDOM/LATEST) or per pagination batch (ALL). For AUTOMATIC this is the size of the incremental slice ingested each run. Controls memory usage during large table scans.',
128
129
  ge=10,
129
130
  le=10000,
130
131
  )
@@ -127,6 +127,11 @@ class FinalizeIngestRunRequest(BaseModel):
127
127
 
128
128
  runner_id: str = Field(serialization_alias="runnerId")
129
129
  seen_hashes: list[str] = Field(serialization_alias="seenHashes")
130
+ # AUTOMATIC sampling cursor to persist on the source for the next run.
131
+ # Omitted (None) for other strategies so the stored cursor is left untouched.
132
+ sampling_cursor: dict[str, Any] | None = Field(
133
+ None, serialization_alias="samplingCursor"
134
+ )
130
135
 
131
136
 
132
137
  class UpdateRunnerStatusRequest(BaseModel):
@@ -165,6 +170,11 @@ class RestOutputSink:
165
170
  self.session.mount("https://", adapter)
166
171
  self._runner_id = context.runner_id
167
172
  self._seen_hashes: set[str] = set()
173
+ self._sampling_cursor: dict[str, Any] | None = None
174
+
175
+ def set_sampling_cursor(self, cursor: dict[str, Any] | None) -> None:
176
+ """Record the AUTOMATIC sampling cursor to persist on finalize."""
177
+ self._sampling_cursor = cursor
168
178
 
169
179
  async def start(self) -> None:
170
180
  if not self.context.source_id:
@@ -244,11 +254,12 @@ class RestOutputSink:
244
254
  payload = FinalizeIngestRunRequest(
245
255
  runner_id=runner_id,
246
256
  seen_hashes=sorted(self._seen_hashes),
257
+ sampling_cursor=self._sampling_cursor,
247
258
  )
248
259
  self._request_json(
249
260
  "POST",
250
261
  f"/sources/{source_id}/assets/finalize",
251
- payload.model_dump(mode="json", by_alias=True),
262
+ payload.model_dump(mode="json", by_alias=True, exclude_none=True),
252
263
  )
253
264
 
254
265
  status_payload = UpdateRunnerStatusRequest(status="COMPLETED")
@@ -1,7 +1,11 @@
1
+ import base64
2
+ import json
3
+ import logging
1
4
  import os
5
+ import threading
2
6
  from abc import ABC, abstractmethod
3
7
  from collections.abc import AsyncGenerator, Generator
4
- from typing import TYPE_CHECKING, Any
8
+ from typing import TYPE_CHECKING, Any, TypeVar
5
9
 
6
10
  from ..models.generated_single_asset_scan_results import DetectionResult, SingleAssetScanResults
7
11
  from ..outputs.rest import IngestEdge
@@ -12,6 +16,10 @@ from ..utils.hashing import calculate_checksum, normalize_http_url
12
16
  from ..utils.validation import validate_output
13
17
  from .recipe_normalizer import normalize_source_recipe
14
18
 
19
+ logger = logging.getLogger(__name__)
20
+
21
+ _T = TypeVar("_T")
22
+
15
23
 
16
24
  class BaseSource(ABC):
17
25
  """
@@ -26,6 +34,10 @@ class BaseSource(ABC):
26
34
  # Default batch size for streaming asset results
27
35
  BATCH_SIZE: int = 50
28
36
  HAS_SUCCESSFUL_RUN_ENV = "CLASSIFYRE_SOURCE_HAS_SUCCESSFUL_RUN"
37
+ # The API injects the saved AUTOMATIC sampling cursor here (base64-encoded
38
+ # JSON) before launching the CLI job. The recipe itself cannot carry it
39
+ # because every source schema sets ``additionalProperties: false``.
40
+ SAMPLING_CURSOR_ENV = "CLASSIFYRE_SAMPLING_CURSOR"
29
41
 
30
42
  def __init__(
31
43
  self,
@@ -42,6 +54,11 @@ class BaseSource(ABC):
42
54
  runner_id: Optional runner ID (for API runs)
43
55
  """
44
56
  normalized_recipe = normalize_source_recipe(recipe, recipe.get("type"))
57
+ # Cursor carried over from the previous run (AUTOMATIC strategy). Read
58
+ # before the override hook so subclasses can consult it there if needed.
59
+ self._sampling_cursor: dict[str, Any] = self._load_sampling_cursor()
60
+ self._next_sampling_cursor: dict[str, Any] | None = None
61
+ self._sampling_cursor_lock = threading.Lock()
45
62
  self._apply_initial_sampling_override(normalized_recipe)
46
63
  recipe.clear()
47
64
  recipe.update(normalized_recipe)
@@ -55,6 +72,108 @@ class BaseSource(ABC):
55
72
  def _apply_initial_sampling_override(self, recipe: dict[str, Any]) -> None:
56
73
  pass
57
74
 
75
+ # ── AUTOMATIC sampling cursor ────────────────────────────────────────
76
+ #
77
+ # AUTOMATIC sampling keeps a small, opaque, source-defined cursor in the
78
+ # API between runs. Each run reads the prior cursor (``sampling_cursor``),
79
+ # ingests the next slice of not-yet-seen data, then records the advanced
80
+ # cursor (``set_next_sampling_cursor``). The output sink persists it back to
81
+ # the API on finalize via ``current_sampling_cursor``. When a source has
82
+ # ingested everything it should reset the cursor so the next run wraps
83
+ # around and re-ingests from the start (data is not stale).
84
+
85
+ def _load_sampling_cursor(self) -> dict[str, Any]:
86
+ raw = os.environ.get(self.SAMPLING_CURSOR_ENV)
87
+ if not raw:
88
+ return {}
89
+ try:
90
+ decoded = base64.b64decode(raw).decode("utf-8")
91
+ data = json.loads(decoded)
92
+ except Exception as exc:
93
+ logger.warning("Ignoring malformed %s: %s", self.SAMPLING_CURSOR_ENV, exc)
94
+ return {}
95
+ return data if isinstance(data, dict) else {}
96
+
97
+ def sampling_cursor(self) -> dict[str, Any]:
98
+ """Return the cursor saved by the previous run (empty on first run)."""
99
+ return self._sampling_cursor
100
+
101
+ def set_next_sampling_cursor(self, cursor: dict[str, Any]) -> None:
102
+ """Record the advanced cursor to persist at the end of this run."""
103
+ self._next_sampling_cursor = cursor
104
+
105
+ def current_sampling_cursor(self) -> dict[str, Any] | None:
106
+ """Cursor to persist for the next run, or None to leave it unchanged.
107
+
108
+ Returns None unless this run advanced the cursor (i.e. AUTOMATIC
109
+ sampling actually ran), so non-AUTOMATIC runs never touch the stored
110
+ cursor.
111
+ """
112
+ return self._next_sampling_cursor
113
+
114
+ def sampling_window_size(self, default: int = 100) -> int:
115
+ """The per-run AUTOMATIC slice size (``rows_per_page``)."""
116
+ config = getattr(self, "config", None)
117
+ sampling = getattr(config, "sampling", None) if config is not None else None
118
+ size = getattr(sampling, "rows_per_page", None)
119
+ try:
120
+ return int(size) if size else default
121
+ except (TypeError, ValueError):
122
+ return default
123
+
124
+ def _record_cursor_key(self, key: str, value: Any) -> None:
125
+ """Thread-safely set ``key`` in the cursor to persist for the next run."""
126
+ with self._sampling_cursor_lock:
127
+ nxt = self._next_sampling_cursor if isinstance(self._next_sampling_cursor, dict) else {}
128
+ nxt = {**nxt, key: value}
129
+ self._next_sampling_cursor = nxt
130
+
131
+ def automatic_offset(self, key: str) -> int:
132
+ """Return the saved offset for a keyed AUTOMATIC DB cursor (0 on first run)."""
133
+ saved = self._sampling_cursor.get(key)
134
+ return saved if isinstance(saved, int) and saved >= 0 else 0
135
+
136
+ def record_automatic_offset(
137
+ self, key: str, *, prev_offset: int, fetched: int
138
+ ) -> None:
139
+ """Advance a keyed offset cursor; wrap to 0 once a page underfills.
140
+
141
+ Used by sources that page rows directly from the backing store
142
+ (``skip``/``OFFSET``) rather than materialising a full list.
143
+ """
144
+ size = self.sampling_window_size()
145
+ next_offset = 0 if fetched < size else prev_offset + fetched
146
+ self._record_cursor_key(key, next_offset)
147
+
148
+ def automatic_window(self, items: list[_T], *, key: str = "items") -> list[_T]:
149
+ """Return the next AUTOMATIC slice of a stably-ordered in-memory list.
150
+
151
+ Non-tabular sources fetch a list of item references, then call this to
152
+ ingest only the next ``rows_per_page`` window. A per-``key`` offset is
153
+ remembered between runs and wraps back to the start once the list has
154
+ been fully covered (data is not stale, so re-ingesting is desired).
155
+
156
+ Callers must pass the items in a **stable order** across runs (e.g. by
157
+ id or timestamp) so the cursor stays meaningful.
158
+ """
159
+ total = len(items)
160
+ if total == 0:
161
+ return []
162
+
163
+ saved = self._sampling_cursor.get(key)
164
+ offset = saved if isinstance(saved, int) and 0 <= saved < total else 0
165
+
166
+ size = self.sampling_window_size()
167
+ window = items[offset : offset + size]
168
+
169
+ next_offset = offset + len(window)
170
+ if next_offset >= total:
171
+ next_offset = 0 # wrap around on the next run
172
+
173
+ self._record_cursor_key(key, next_offset)
174
+
175
+ return window
176
+
58
177
  @staticmethod
59
178
  def _read_bool_env(name: str) -> bool | None:
60
179
  raw = os.environ.get(name)
@@ -242,11 +242,24 @@ class ConfluenceSource(BaseSource):
242
242
  params["labels"] = ",".join(str(v) for v in spaces_filter.labels)
243
243
  return self.client.iter_confluence_results("/wiki/api/v2/spaces", params=params)
244
244
 
245
+ def _sorted_page_refs(self, refs: list[dict[str, Any]]) -> list[dict[str, Any]]:
246
+ return sorted(
247
+ refs,
248
+ key=lambda ref: parse_datetime(
249
+ str(ref.get("version_created_at") or ref.get("created_at") or "")
250
+ ),
251
+ reverse=True,
252
+ )
253
+
245
254
  def _sample_page_refs(self, refs: list[dict[str, Any]]) -> list[dict[str, Any]]:
246
255
  sampling = self.config.sampling
247
256
  if sampling.strategy == SamplingStrategy.ALL:
248
257
  return refs
249
258
 
259
+ if sampling.strategy == SamplingStrategy.AUTOMATIC:
260
+ # Newest-first stable order; window advances each run and wraps around.
261
+ return self.automatic_window(self._sorted_page_refs(refs), key="pages")
262
+
250
263
  limit = int(sampling.rows_per_page or 100)
251
264
  if limit >= len(refs):
252
265
  return refs
@@ -254,14 +267,7 @@ class ConfluenceSource(BaseSource):
254
267
  if sampling.strategy == SamplingStrategy.RANDOM:
255
268
  return deterministic_sample(refs, limit)
256
269
 
257
- refs_sorted = sorted(
258
- refs,
259
- key=lambda ref: parse_datetime(
260
- str(ref.get("version_created_at") or ref.get("created_at") or "")
261
- ),
262
- reverse=True,
263
- )
264
- return refs_sorted[:limit]
270
+ return self._sorted_page_refs(refs)[:limit]
265
271
 
266
272
  def _extract_page_assets(self, ref: dict[str, Any]) -> list[SingleAssetScanResults]:
267
273
  page_id = str(ref["page_id"])
@@ -423,6 +423,11 @@ class DatabricksSource(BaseTabularSource):
423
423
  return value.isoformat()
424
424
  return str(value)
425
425
 
426
+ def _automatic_supports_keyset(self) -> bool:
427
+ # Databricks builds inline (parameter-less) queries; AUTOMATIC uses OFFSET
428
+ # paging through _fetch_one_page rather than keyset WHERE clauses.
429
+ return False
430
+
426
431
  # ── Databricks pagination (inline LIMIT/OFFSET) ──────────────────────
427
432
 
428
433
  def _fetch_one_page(
@@ -195,6 +195,11 @@ class EmailSource(BaseSource):
195
195
  total = 0
196
196
 
197
197
  try:
198
+ if strategy == SamplingStrategy.AUTOMATIC:
199
+ async for batch in self._extract_automatic(mod, criteria):
200
+ yield batch
201
+ return
202
+
198
203
  for folder in self.folders:
199
204
  if self._aborted or (limit is not None and total >= limit):
200
205
  break
@@ -236,6 +241,63 @@ class EmailSource(BaseSource):
236
241
  finally:
237
242
  logger.info("Extracted %s email messages", total)
238
243
 
244
+ async def _extract_automatic(
245
+ self, mod: Any, criteria: Any
246
+ ) -> AsyncGenerator[list[SingleAssetScanResults], None]:
247
+ """AUTOMATIC sampling: page through each folder's messages by UID.
248
+
249
+ Listing UIDs is cheap (no body fetch); we window the UID list (newest
250
+ first) so each run ingests the next ``rows_per_page`` slice per folder
251
+ and wraps around once the folder has been fully covered.
252
+ """
253
+ pending: list[SingleAssetScanResults] = []
254
+ total = 0
255
+ for folder in self.folders:
256
+ if self._aborted:
257
+ break
258
+ try:
259
+ self._mailbox.folder.set(folder)
260
+ except Exception as e:
261
+ logger.warning("Skipping folder %s: %s", folder, e)
262
+ continue
263
+
264
+ try:
265
+ uid_ints = sorted((int(u) for u in self._mailbox.uids(criteria)), reverse=True)
266
+ except Exception as e:
267
+ logger.warning("Could not list UIDs for folder %s: %s", folder, e)
268
+ continue
269
+ if not uid_ints:
270
+ continue
271
+
272
+ window = self.automatic_window([str(u) for u in uid_ints], key=f"folder:{folder}")
273
+ if not window:
274
+ continue
275
+
276
+ for msg in self._mailbox.fetch(
277
+ mod.AND(uid=",".join(window)),
278
+ mark_seen=False,
279
+ bulk=self.BATCH_SIZE,
280
+ ):
281
+ if self._aborted:
282
+ break
283
+ try:
284
+ assets = self._message_to_assets(msg, folder)
285
+ except Exception as e:
286
+ logger.error(
287
+ "Failed to transform message uid=%s: %s", getattr(msg, "uid", "?"), e
288
+ )
289
+ continue
290
+ for asset in assets:
291
+ pending.append(asset)
292
+ while len(pending) >= self.BATCH_SIZE:
293
+ yield pending[: self.BATCH_SIZE]
294
+ pending = pending[self.BATCH_SIZE :]
295
+ total += 1
296
+
297
+ if pending:
298
+ yield pending
299
+ logger.info("Extracted %s email messages (AUTOMATIC)", total)
300
+
239
301
  def _message_to_assets(self, msg: Any, folder: str) -> list[SingleAssetScanResults]:
240
302
  message_id = self._message_id(msg, folder)
241
303
  email_hash = self.generate_hash_id(message_id)
@@ -193,11 +193,28 @@ class JiraSource(BaseSource):
193
193
  return f"{query} ORDER BY updated DESC"
194
194
  return query
195
195
 
196
+ def _sorted_issues(self, issues: list[dict[str, Any]]) -> list[dict[str, Any]]:
197
+ return sorted(
198
+ issues,
199
+ key=lambda issue: parse_datetime(
200
+ str(
201
+ issue.get("fields", {}).get("updated")
202
+ if isinstance(issue.get("fields"), dict)
203
+ else ""
204
+ )
205
+ ),
206
+ reverse=True,
207
+ )
208
+
196
209
  def _sample_issues(self, issues: list[dict[str, Any]]) -> list[dict[str, Any]]:
197
210
  sampling = self.config.sampling
198
211
  if sampling.strategy == SamplingStrategy.ALL:
199
212
  return issues
200
213
 
214
+ if sampling.strategy == SamplingStrategy.AUTOMATIC:
215
+ # Newest-first stable order; window advances each run and wraps around.
216
+ return self.automatic_window(self._sorted_issues(issues), key="issues")
217
+
201
218
  limit = int(sampling.rows_per_page or 100)
202
219
  if limit >= len(issues):
203
220
  return issues
@@ -205,18 +222,7 @@ class JiraSource(BaseSource):
205
222
  if sampling.strategy == SamplingStrategy.RANDOM:
206
223
  return deterministic_sample(issues, limit)
207
224
 
208
- sorted_issues = sorted(
209
- issues,
210
- key=lambda issue: parse_datetime(
211
- str(
212
- issue.get("fields", {}).get("updated")
213
- if isinstance(issue.get("fields"), dict)
214
- else ""
215
- )
216
- ),
217
- reverse=True,
218
- )
219
- return sorted_issues[:limit]
225
+ return self._sorted_issues(issues)[:limit]
220
226
 
221
227
  def _extract_issue_assets(self, issue: dict[str, Any]) -> list[SingleAssetScanResults]:
222
228
  fields = issue.get("fields", {})
@@ -407,6 +407,14 @@ class MongoDBSource(BaseSource):
407
407
  if strategy == SamplingStrategy.ALL:
408
408
  return list(collection.find({}).limit(rows_per_page))
409
409
 
410
+ if strategy == SamplingStrategy.AUTOMATIC:
411
+ # Page forward through the collection each run; wrap when exhausted.
412
+ key = f"collection:{collection_ref.database}.{collection_ref.collection}"
413
+ offset = self.automatic_offset(key)
414
+ documents = list(collection.find({}).skip(offset).limit(rows_per_page))
415
+ self.record_automatic_offset(key, prev_offset=offset, fetched=len(documents))
416
+ return documents
417
+
410
418
  if strategy == SamplingStrategy.RANDOM:
411
419
  return self._sample_random_documents(collection, rows_per_page)
412
420
 
@@ -392,6 +392,14 @@ class Neo4jSource(BaseSource):
392
392
  strategy = sampling.strategy
393
393
  rows = int(sampling.rows_per_page or 100)
394
394
 
395
+ if strategy == SamplingStrategy.AUTOMATIC:
396
+ # Page forward through this label's nodes each run; wrap when exhausted.
397
+ key = f"label:{ref.label}"
398
+ offset = self.automatic_offset(key)
399
+ page = self._fetch_nodes_page(ref, skip=offset, limit=rows)
400
+ self.record_automatic_offset(key, prev_offset=offset, fetched=len(page))
401
+ return page
402
+
395
403
  if strategy == SamplingStrategy.RANDOM:
396
404
  cypher = (
397
405
  f"MATCH (n:{_escape_label(ref.label)}) "
@@ -338,11 +338,22 @@ class NotionSource(BaseSource):
338
338
  "edited": obj.get("last_edited_time") or obj.get("created_time") or "",
339
339
  }
340
340
 
341
+ def _sorted_refs(self, refs: list[dict[str, Any]]) -> list[dict[str, Any]]:
342
+ return sorted(
343
+ refs,
344
+ key=lambda ref: parse_datetime(str(ref.get("edited") or "")),
345
+ reverse=True,
346
+ )
347
+
341
348
  def _sample_refs(self, refs: list[dict[str, Any]]) -> list[dict[str, Any]]:
342
349
  sampling = self.config.sampling
343
350
  if sampling.strategy == SamplingStrategy.ALL:
344
351
  return refs
345
352
 
353
+ if sampling.strategy == SamplingStrategy.AUTOMATIC:
354
+ # Newest-first stable order; window advances each run and wraps around.
355
+ return self.automatic_window(self._sorted_refs(refs), key="refs")
356
+
346
357
  limit = int(sampling.rows_per_page or 100)
347
358
  if limit >= len(refs):
348
359
  return refs
@@ -350,12 +361,7 @@ class NotionSource(BaseSource):
350
361
  if sampling.strategy == SamplingStrategy.RANDOM:
351
362
  return deterministic_sample(refs, limit)
352
363
 
353
- refs_sorted = sorted(
354
- refs,
355
- key=lambda ref: parse_datetime(str(ref.get("edited") or "")),
356
- reverse=True,
357
- )
358
- return refs_sorted[:limit]
364
+ return self._sorted_refs(refs)[:limit]
359
365
 
360
366
  # ------------------------------------------------------------------- pages
361
367
  def _extract_page_assets(self, page: dict[str, Any]) -> list[SingleAssetScanResults]:
@@ -271,6 +271,11 @@ class ObjectStorageSourceBase(BaseSource, ABC):
271
271
 
272
272
  materialized = list(refs)
273
273
 
274
+ if strategy == SamplingStrategy.AUTOMATIC:
275
+ # Newest-first stable order; window advances each run and wraps around.
276
+ materialized.sort(key=lambda ref: ref.last_modified, reverse=True)
277
+ return self.automatic_window(materialized, key="objects")
278
+
274
279
  if strategy == SamplingStrategy.RANDOM:
275
280
  if limit >= len(materialized):
276
281
  return materialized
@@ -569,11 +569,28 @@ class PowerBISource(BaseSource):
569
569
  return parsed
570
570
  return None
571
571
 
572
+ def _ordered_refs_for_automatic(
573
+ self, refs: list[PowerBIAssetRef], order_field: str
574
+ ) -> list[PowerBIAssetRef]:
575
+ values = [self._sampling_sort_datetime(ref, order_field) for ref in refs]
576
+ scored: list[tuple[bool, datetime, PowerBIAssetRef]] = []
577
+ for ref, parsed in zip(refs, values, strict=False):
578
+ effective = parsed or ref.updated_at
579
+ scored.append((parsed is not None, effective, ref))
580
+ scored.sort(key=lambda item: (item[0], item[1]), reverse=True)
581
+ return [item[2] for item in scored]
582
+
572
583
  def _sample_refs(self, refs: list[PowerBIAssetRef]) -> list[PowerBIAssetRef]:
573
584
  sampling = self._sampling()
574
585
  if sampling.strategy == SamplingStrategy.ALL:
575
586
  return refs
576
587
 
588
+ if sampling.strategy == SamplingStrategy.AUTOMATIC:
589
+ # Newest-first stable order; window advances each run and wraps around.
590
+ order_field = sampling.order_by_column or "modifiedDateTime"
591
+ ordered = self._ordered_refs_for_automatic(refs, order_field)
592
+ return self.automatic_window(ordered, key="refs")
593
+
577
594
  if sampling.strategy == SamplingStrategy.RANDOM:
578
595
  limit = int(sampling.rows_per_page or 100)
579
596
  if limit >= len(refs):
@@ -3,7 +3,7 @@ from __future__ import annotations
3
3
  from copy import deepcopy
4
4
  from typing import Any
5
5
 
6
- _VALID_SAMPLING_STRATEGIES = {"RANDOM", "LATEST", "ALL"}
6
+ _VALID_SAMPLING_STRATEGIES = {"AUTOMATIC", "RANDOM", "LATEST", "ALL"}
7
7
 
8
8
 
9
9
  def _as_dict(value: Any) -> dict[str, Any]:
@@ -130,7 +130,7 @@ def normalize_source_recipe(
130
130
  _normalize_sampling_strategy(sampling.get("strategy")),
131
131
  _normalize_sampling_strategy(optional_sampling.get("strategy")),
132
132
  _normalize_sampling_strategy(optional_sampling.get("mode")),
133
- "RANDOM",
133
+ "AUTOMATIC",
134
134
  )
135
135
 
136
136
  sampling["strategy"] = strategy
@@ -204,6 +204,11 @@ class ServiceDeskSource(BaseSource):
204
204
  if sampling.strategy == SamplingStrategy.ALL:
205
205
  return requests
206
206
 
207
+ if sampling.strategy == SamplingStrategy.AUTOMATIC:
208
+ # Newest-first stable order; window advances each run and wraps around.
209
+ sorted_requests = sorted(requests, key=self._request_sort_timestamp, reverse=True)
210
+ return self.automatic_window(sorted_requests, key="requests")
211
+
207
212
  limit = int(sampling.rows_per_page or 100)
208
213
  if limit >= len(requests):
209
214
  return requests