classifyre-cli 0.4.18__tar.gz → 0.4.20__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (206) hide show
  1. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/.turbo/turbo-build.log +1 -1
  2. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/PKG-INFO +1 -1
  3. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/package.json +1 -1
  4. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/pyproject.toml +13 -1
  5. classifyre_cli-0.4.20/src/config.py +76 -0
  6. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/models/generated_input.py +138 -41
  7. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/base.py +8 -0
  8. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/object_storage/base.py +10 -8
  9. classifyre_cli-0.4.20/src/sources/youtube/__init__.py +3 -0
  10. classifyre_cli-0.4.20/src/sources/youtube/source.py +589 -0
  11. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/utils/dependency_groups.py +6 -2
  12. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/utils/file_parser.py +65 -4
  13. classifyre_cli-0.4.20/src/utils/transcription.py +177 -0
  14. classifyre_cli-0.4.20/tests/test_config.py +64 -0
  15. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_dependency_groups.py +9 -0
  16. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_s3_compatible_storage_source.py +2 -0
  17. classifyre_cli-0.4.20/tests/test_source_dependency_groups.py +266 -0
  18. classifyre_cli-0.4.20/tests/test_youtube_source.py +247 -0
  19. classifyre_cli-0.4.20/tests/test_youtube_source_integration.py +77 -0
  20. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/utils/test_file_parser.py +76 -0
  21. classifyre_cli-0.4.20/tests/utils/test_transcription.py +92 -0
  22. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/uv.lock +391 -175
  23. classifyre_cli-0.4.18/tests/test_source_dependency_groups.py +0 -74
  24. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/.gitignore +0 -0
  25. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/.python-version +0 -0
  26. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/README.md +0 -0
  27. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/main.py +0 -0
  28. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/scripts/generate_models.py +0 -0
  29. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/__init__.py +0 -0
  30. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/__init__.py +0 -0
  31. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/base.py +0 -0
  32. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/broken_links/__init__.py +0 -0
  33. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/broken_links/detector.py +0 -0
  34. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/config.py +0 -0
  35. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/content/__init__.py +0 -0
  36. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/custom/__init__.py +0 -0
  37. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/custom/detector.py +0 -0
  38. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/custom/extractor.py +0 -0
  39. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/custom/runners/__init__.py +0 -0
  40. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/custom/runners/_base.py +0 -0
  41. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/custom/runners/_factory.py +0 -0
  42. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/custom/runners/_feature_extraction.py +0 -0
  43. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/custom/runners/_gliner2.py +0 -0
  44. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/custom/runners/_image_classification.py +0 -0
  45. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/custom/runners/_llm.py +0 -0
  46. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/custom/runners/_object_detection.py +0 -0
  47. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/custom/runners/_regex.py +0 -0
  48. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/custom/runners/_text_classification.py +0 -0
  49. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/custom/trainer.py +0 -0
  50. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/dependencies.py +0 -0
  51. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/pii/__init__.py +0 -0
  52. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/pii/detector.py +0 -0
  53. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/secrets/__init__.py +0 -0
  54. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/secrets/detector.py +0 -0
  55. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/threat/__init__.py +0 -0
  56. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/threat/code_security_detector.py +0 -0
  57. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/detectors/threat/yara_detector.py +0 -0
  58. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/main.py +0 -0
  59. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/models/generated_detectors.py +0 -0
  60. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/models/generated_single_asset_scan_results.py +0 -0
  61. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/outputs/__init__.py +0 -0
  62. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/outputs/base.py +0 -0
  63. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/outputs/console.py +0 -0
  64. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/outputs/factory.py +0 -0
  65. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/outputs/file.py +0 -0
  66. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/outputs/rest.py +0 -0
  67. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/pipeline/__init__.py +0 -0
  68. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/pipeline/content_provider.py +0 -0
  69. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/pipeline/detector_pipeline.py +0 -0
  70. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/pipeline/parsed_content_provider.py +0 -0
  71. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/pipeline/worker_pool.py +0 -0
  72. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sandbox/__init__.py +0 -0
  73. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sandbox/runner.py +0 -0
  74. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/__init__.py +0 -0
  75. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/asset_metadata.py +0 -0
  76. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/atlassian_common.py +0 -0
  77. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/azure_blob_storage/__init__.py +0 -0
  78. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/azure_blob_storage/source.py +0 -0
  79. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/confluence/__init__.py +0 -0
  80. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/confluence/source.py +0 -0
  81. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/databricks/__init__.py +0 -0
  82. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/databricks/source.py +0 -0
  83. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/dependencies.py +0 -0
  84. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/email/__init__.py +0 -0
  85. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/email/source.py +0 -0
  86. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/google_cloud_storage/__init__.py +0 -0
  87. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/google_cloud_storage/source.py +0 -0
  88. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/hive/__init__.py +0 -0
  89. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/hive/source.py +0 -0
  90. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/jira/__init__.py +0 -0
  91. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/jira/source.py +0 -0
  92. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/mongodb/__init__.py +0 -0
  93. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/mongodb/source.py +0 -0
  94. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/mssql/__init__.py +0 -0
  95. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/mssql/source.py +0 -0
  96. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/mysql/__init__.py +0 -0
  97. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/mysql/source.py +0 -0
  98. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/neo4j/__init__.py +0 -0
  99. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/neo4j/source.py +0 -0
  100. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/notion/__init__.py +0 -0
  101. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/notion/client.py +0 -0
  102. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/notion/source.py +0 -0
  103. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/oracle/__init__.py +0 -0
  104. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/oracle/source.py +0 -0
  105. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/postgresql/__init__.py +0 -0
  106. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/postgresql/source.py +0 -0
  107. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/powerbi/__init__.py +0 -0
  108. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/powerbi/source.py +0 -0
  109. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/recipe_normalizer.py +0 -0
  110. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/s3_compatible_storage/README.md +0 -0
  111. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/s3_compatible_storage/__init__.py +0 -0
  112. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/s3_compatible_storage/source.py +0 -0
  113. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/servicedesk/__init__.py +0 -0
  114. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/servicedesk/source.py +0 -0
  115. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/slack/__init__.py +0 -0
  116. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/slack/source.py +0 -0
  117. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/snowflake/__init__.py +0 -0
  118. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/snowflake/source.py +0 -0
  119. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/sqlite/__init__.py +0 -0
  120. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/sqlite/source.py +0 -0
  121. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/tableau/__init__.py +0 -0
  122. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/tableau/source.py +0 -0
  123. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/tabular_base.py +0 -0
  124. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/tabular_utils.py +0 -0
  125. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/wordpress/__init__.py +0 -0
  126. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/sources/wordpress/source.py +0 -0
  127. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/telemetry.py +0 -0
  128. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/utils/__init__.py +0 -0
  129. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/utils/content_extraction.py +0 -0
  130. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/utils/embedded_images.py +0 -0
  131. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/utils/file_metadata.py +0 -0
  132. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/utils/file_to_images.py +0 -0
  133. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/utils/hashing.py +0 -0
  134. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/utils/uv_sync.py +0 -0
  135. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/src/utils/validation.py +0 -0
  136. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/__init__.py +0 -0
  137. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/conftest.py +0 -0
  138. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/__init__.py +0 -0
  139. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
  140. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/conftest.py +0 -0
  141. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/content/__init__.py +0 -0
  142. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/custom/__init__.py +0 -0
  143. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/custom/conftest.py +0 -0
  144. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/custom/test_invoice_extraction.py +0 -0
  145. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/custom/test_llm_runner.py +0 -0
  146. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/custom/test_pipeline_integration.py +0 -0
  147. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/custom/test_regex_runner.py +0 -0
  148. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/custom/test_transformer_runners.py +0 -0
  149. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/pii/__init__.py +0 -0
  150. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/pii/conftest.py +0 -0
  151. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/pii/sample_invoice.pdf +0 -0
  152. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/pii/test_pii_detector.py +0 -0
  153. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
  154. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/secrets/__init__.py +0 -0
  155. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/secrets/test_secrets_detector.py +0 -0
  156. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
  157. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/test_base_detector.py +0 -0
  158. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
  159. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/test_detector_catalog_commercial.py +0 -0
  160. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/test_detector_pipeline_types.py +0 -0
  161. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/test_detector_schema_examples.py +0 -0
  162. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/test_detector_types.py +0 -0
  163. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/test_phase2_detectors.py +0 -0
  164. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/test_registry.py +0 -0
  165. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/threat/__init__.py +0 -0
  166. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/threat/test_code_security_detector.py +0 -0
  167. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/detectors/threat/test_yara_detector.py +0 -0
  168. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
  169. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/integration/test_wordpress_links_assets.py +0 -0
  170. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/pipeline/test_detector_pipeline.py +0 -0
  171. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/pipeline/test_worker_pool.py +0 -0
  172. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_assets_metadata_catalog.py +0 -0
  173. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_azure_blob_storage_source.py +0 -0
  174. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_base_source_attachment.py +0 -0
  175. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_base_source_sampling.py +0 -0
  176. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_confluence_source.py +0 -0
  177. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_custom_extractor.py +0 -0
  178. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_databricks_source.py +0 -0
  179. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_email_source.py +0 -0
  180. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_google_cloud_storage_source.py +0 -0
  181. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_hashing.py +0 -0
  182. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_hive_source.py +0 -0
  183. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_jira_source.py +0 -0
  184. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_mongodb_source.py +0 -0
  185. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_mssql_source.py +0 -0
  186. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_mysql_source.py +0 -0
  187. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_neo4j_source.py +0 -0
  188. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_notion_source.py +0 -0
  189. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_oracle_source.py +0 -0
  190. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_outputs.py +0 -0
  191. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_postgresql_source.py +0 -0
  192. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_powerbi_source.py +0 -0
  193. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_recipe_normalizer.py +0 -0
  194. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_sandbox_runner.py +0 -0
  195. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_servicedesk_source.py +0 -0
  196. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_slack_source.py +0 -0
  197. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_snowflake_source.py +0 -0
  198. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_sqlite_source.py +0 -0
  199. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_tableau_source.py +0 -0
  200. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_tabular_utils.py +0 -0
  201. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_uv_sync.py +0 -0
  202. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/test_wordpress_source.py +0 -0
  203. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/utils/test_content_extraction.py +0 -0
  204. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/utils/test_embedded_images.py +0 -0
  205. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/utils/test_file_metadata.py +0 -0
  206. {classifyre_cli-0.4.18 → classifyre_cli-0.4.20}/tests/utils/test_file_to_images.py +0 -0
@@ -1,3 +1,3 @@
1
1
  $ uv sync
2
- Resolved 256 packages in 167ms
2
+ Resolved 262 packages in 282ms
3
3
  Checked 50 packages in 1ms
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: classifyre-cli
3
- Version: 0.4.18
3
+ Version: 0.4.20
4
4
  Summary: Classifyre CLI — scan and classify unstructured data sources
5
5
  License: MIT
6
6
  Keywords: data,ingestion,metadata,pii,secrets,unstructured
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@classifyre/cli",
3
- "version": "0.4.18",
3
+ "version": "0.4.20",
4
4
  "private": true,
5
5
  "scripts": {
6
6
  "build": "uv sync",
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "classifyre-cli"
3
- version = "0.4.18"
3
+ version = "0.4.20"
4
4
  description = "Classifyre CLI — scan and classify unstructured data sources"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -125,6 +125,7 @@ detectors = [
125
125
  { include-group = "custom" },
126
126
  { include-group = "regex" },
127
127
  { include-group = "llm" },
128
+ { include-group = "transcription" },
128
129
  ]
129
130
  file-processing = [
130
131
  "filetype>=1.2.0",
@@ -134,6 +135,11 @@ file-processing = [
134
135
  "chardet>=7.4.3",
135
136
  "pyarrow>=18.0.0",
136
137
  ]
138
+ transcription = [
139
+ # CPU audio/video transcription. faster-whisper decodes media via bundled
140
+ # PyAV (no system ffmpeg required) and runs ctranslate2 for inference.
141
+ "faster-whisper>=1.1.0",
142
+ ]
137
143
  ocr = [
138
144
  { include-group = "file-processing" },
139
145
  "docling>=2.94.0",
@@ -192,6 +198,10 @@ google-cloud-storage = [
192
198
  email = [
193
199
  "imap-tools>=1.10.0,<2.0.0",
194
200
  ]
201
+ youtube = [
202
+ "yt-dlp>=2025.1.0",
203
+ "youtube-transcript-api>=1.0.0",
204
+ ]
195
205
  otel = [
196
206
  "opentelemetry-sdk>=1.42.0",
197
207
  "opentelemetry-exporter-otlp-proto-http>=1.27.0",
@@ -309,6 +319,8 @@ module = [
309
319
  "tldextract.*",
310
320
  "re2",
311
321
  "re2.*",
322
+ "faster_whisper",
323
+ "faster_whisper.*",
312
324
  ]
313
325
  ignore_missing_imports = true
314
326
 
@@ -0,0 +1,76 @@
1
+ """Runtime configuration loaded from environment variables.
2
+
3
+ A central, reusable place for tunables that may be overridden via `.env`
4
+ (loaded by ``src.main.load_local_env``) without touching source recipes. Each
5
+ concrete config section is a small pydantic model with a cached accessor so the
6
+ environment is read once per process.
7
+
8
+ Future sources/processors can add their own sections here following the same
9
+ pattern (``BaseModel`` + ``functools.lru_cache`` accessor).
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import os
15
+ from functools import lru_cache
16
+
17
+ from pydantic import BaseModel, Field
18
+
19
+
20
+ def _env_str(name: str, default: str) -> str:
21
+ value = os.environ.get(name)
22
+ if value is None:
23
+ return default
24
+ value = value.strip()
25
+ return value or default
26
+
27
+
28
+ def _env_int(name: str, default: int) -> int:
29
+ raw = os.environ.get(name)
30
+ if raw is None or not raw.strip():
31
+ return default
32
+ try:
33
+ return int(raw.strip())
34
+ except ValueError:
35
+ return default
36
+
37
+
38
+ def _env_bool(name: str, default: bool) -> bool:
39
+ raw = os.environ.get(name)
40
+ if raw is None or not raw.strip():
41
+ return default
42
+ return raw.strip().lower() in {"1", "true", "yes", "on"}
43
+
44
+
45
+ class WhisperConfig(BaseModel):
46
+ """faster-whisper transcription settings (CPU-only defaults).
47
+
48
+ Overridable via environment so a deployment can trade speed for accuracy
49
+ (e.g. a larger model, GPU device, or float16 compute) without code changes.
50
+ """
51
+
52
+ model: str = Field(
53
+ "medium", description="Whisper model size or path (e.g. tiny, base, medium, large-v3)."
54
+ )
55
+ device: str = Field("cpu", description="Inference device: cpu, cuda, or auto.")
56
+ compute_type: str = Field(
57
+ "int8", description="ctranslate2 compute type: int8, int8_float16, float16, float32."
58
+ )
59
+ beam_size: int = Field(5, ge=1, description="Beam search width.")
60
+ vad_filter: bool = Field(
61
+ True, description="Drop non-speech segments with Silero VAD before decoding."
62
+ )
63
+ word_timestamps: bool = Field(True, description="Emit per-word timestamps during decoding.")
64
+
65
+
66
+ @lru_cache(maxsize=1)
67
+ def get_whisper_config() -> WhisperConfig:
68
+ """Return the process-wide WhisperConfig, populated from the environment."""
69
+ return WhisperConfig(
70
+ model=_env_str("CLASSIFYRE_WHISPER_MODEL", "medium"),
71
+ device=_env_str("CLASSIFYRE_WHISPER_DEVICE", "cpu"),
72
+ compute_type=_env_str("CLASSIFYRE_WHISPER_COMPUTE_TYPE", "int8"),
73
+ beam_size=_env_int("CLASSIFYRE_WHISPER_BEAM_SIZE", 5),
74
+ vad_filter=_env_bool("CLASSIFYRE_WHISPER_VAD_FILTER", True),
75
+ word_timestamps=_env_bool("CLASSIFYRE_WHISPER_WORD_TIMESTAMPS", True),
76
+ )
@@ -44,15 +44,7 @@ class AssetType(StrEnum):
44
44
  SQLITE = 'SQLITE'
45
45
  NOTION = 'NOTION'
46
46
  EMAIL = 'EMAIL'
47
-
48
-
49
- class SourceCategory(StrEnum):
50
- """
51
- Category of the source: TABULAR for structured databases (PostgreSQL, MySQL, MSSQL, Oracle, Hive, Databricks Unity Catalog, Snowflake), UNSTRUCTURED for text/web/document sources (WordPress, S3-Compatible Storage, Azure Blob Storage, Google Cloud Storage, Slack, MongoDB, PowerBI, Tableau, Confluence, Jira, Service Desk)
52
- """
53
-
54
- TABULAR = 'TABULAR'
55
- UNSTRUCTURED = 'UNSTRUCTURED'
47
+ YOUTUBE = 'YOUTUBE'
56
48
 
57
49
 
58
50
  class DetectorType(StrEnum):
@@ -114,6 +106,10 @@ class SamplingConfig(BaseModel):
114
106
  False,
115
107
  description='When true, enable OCR/text extraction for supported binary documents and images before routing text-capable detectors.',
116
108
  )
109
+ enable_transcription: bool | None = Field(
110
+ False,
111
+ description='When true, transcribe audio and video files to text (via faster-whisper) before routing text-capable detectors. Slower and requires the transcription dependency.',
112
+ )
117
113
  order_by_column: str | None = Field(
118
114
  None,
119
115
  description='Column to use for LATEST sampling mode in tabular sources (usually created_at/updated_at). Auto-detected when not set.',
@@ -318,6 +314,115 @@ class EmailOptional(BaseModel):
318
314
  scope: EmailOptionalScope | None = None
319
315
 
320
316
 
317
+ class Type(StrEnum):
318
+ """
319
+ Type of the asset or source
320
+ """
321
+
322
+ WORDPRESS = 'WORDPRESS'
323
+ SLACK = 'SLACK'
324
+ S3_COMPATIBLE_STORAGE = 'S3_COMPATIBLE_STORAGE'
325
+ AZURE_BLOB_STORAGE = 'AZURE_BLOB_STORAGE'
326
+ GOOGLE_CLOUD_STORAGE = 'GOOGLE_CLOUD_STORAGE'
327
+ POSTGRESQL = 'POSTGRESQL'
328
+ MYSQL = 'MYSQL'
329
+ MSSQL = 'MSSQL'
330
+ ORACLE = 'ORACLE'
331
+ HIVE = 'HIVE'
332
+ DATABRICKS = 'DATABRICKS'
333
+ SNOWFLAKE = 'SNOWFLAKE'
334
+ MONGODB = 'MONGODB'
335
+ NEO4J = 'NEO4J'
336
+ POWERBI = 'POWERBI'
337
+ TABLEAU = 'TABLEAU'
338
+ CONFLUENCE = 'CONFLUENCE'
339
+ JIRA = 'JIRA'
340
+ SERVICEDESK = 'SERVICEDESK'
341
+ SQLITE = 'SQLITE'
342
+ NOTION = 'NOTION'
343
+ EMAIL = 'EMAIL'
344
+ YOUTUBE = 'YOUTUBE'
345
+
346
+
347
+ class YouTubeRequired(BaseModel):
348
+ """
349
+ Provide at least one of channels or video_urls (enforced at runtime).
350
+ """
351
+
352
+ model_config = ConfigDict(
353
+ extra='forbid',
354
+ )
355
+ channels: list[str] | None = Field(
356
+ None,
357
+ description='Channel URLs or handles to list videos from (e.g. https://www.youtube.com/@OpenAI or @OpenAI). At least one of channels/video_urls is required.',
358
+ )
359
+ video_urls: list[str] | None = Field(
360
+ None,
361
+ description='Explicit video watch URLs to scan (e.g. https://www.youtube.com/watch?v=dQw4w9WgXcQ). At least one of channels/video_urls is required.',
362
+ )
363
+
364
+
365
+ class YouTubeMasked(BaseModel):
366
+ """
367
+ Optional credentials. Leave empty for public videos.
368
+ """
369
+
370
+ model_config = ConfigDict(
371
+ extra='forbid',
372
+ )
373
+ cookies: str | None = Field(
374
+ None,
375
+ description='Netscape-format cookie file contents, used by yt-dlp to access age-restricted or members-only videos.',
376
+ )
377
+
378
+
379
+ class YouTubeOptionalTranscript(BaseModel):
380
+ """
381
+ Transcript/caption fetching controls.
382
+ """
383
+
384
+ model_config = ConfigDict(
385
+ extra='forbid',
386
+ )
387
+ languages: list[str] | None = Field(
388
+ None,
389
+ description='Preferred caption language codes in priority order (e.g. ["en"]). Empty means accept any available language.',
390
+ )
391
+ skip_transcript: bool | None = Field(
392
+ False,
393
+ description='When true, skip transcript fetching entirely (metadata-only assets, no detector content).',
394
+ )
395
+
396
+
397
+ class YouTubeOptionalConnection(BaseModel):
398
+ """
399
+ Network controls for yt-dlp and transcript fetching.
400
+ """
401
+
402
+ model_config = ConfigDict(
403
+ extra='forbid',
404
+ )
405
+ proxy_url: str | None = Field(
406
+ None,
407
+ description='Optional HTTP/HTTPS/SOCKS proxy URL to mitigate rate-limiting when scanning at scale.',
408
+ )
409
+ request_timeout_seconds: int | None = Field(
410
+ 30, description='Socket timeout for yt-dlp network operations.', ge=1, le=300
411
+ )
412
+ ignore_errors: bool | None = Field(
413
+ True,
414
+ description='Continue past individual videos that fail to extract instead of aborting the run.',
415
+ )
416
+
417
+
418
+ class YouTubeOptional(BaseModel):
419
+ model_config = ConfigDict(
420
+ extra='forbid',
421
+ )
422
+ transcript: YouTubeOptionalTranscript | None = None
423
+ connection: YouTubeOptionalConnection | None = None
424
+
425
+
321
426
  class SlackRequired(BaseModel):
322
427
  model_config = ConfigDict(
323
428
  extra='forbid',
@@ -1926,35 +2031,6 @@ class CoreInput(BaseModel):
1926
2031
  resources: ResourceOverrides | None = None
1927
2032
 
1928
2033
 
1929
- class Type(StrEnum):
1930
- """
1931
- Type of the asset or source
1932
- """
1933
-
1934
- WORDPRESS = 'WORDPRESS'
1935
- SLACK = 'SLACK'
1936
- S3_COMPATIBLE_STORAGE = 'S3_COMPATIBLE_STORAGE'
1937
- AZURE_BLOB_STORAGE = 'AZURE_BLOB_STORAGE'
1938
- GOOGLE_CLOUD_STORAGE = 'GOOGLE_CLOUD_STORAGE'
1939
- POSTGRESQL = 'POSTGRESQL'
1940
- MYSQL = 'MYSQL'
1941
- MSSQL = 'MSSQL'
1942
- ORACLE = 'ORACLE'
1943
- HIVE = 'HIVE'
1944
- DATABRICKS = 'DATABRICKS'
1945
- SNOWFLAKE = 'SNOWFLAKE'
1946
- MONGODB = 'MONGODB'
1947
- NEO4J = 'NEO4J'
1948
- POWERBI = 'POWERBI'
1949
- TABLEAU = 'TABLEAU'
1950
- CONFLUENCE = 'CONFLUENCE'
1951
- JIRA = 'JIRA'
1952
- SERVICEDESK = 'SERVICEDESK'
1953
- SQLITE = 'SQLITE'
1954
- NOTION = 'NOTION'
1955
- EMAIL = 'EMAIL'
1956
-
1957
-
1958
2034
  class SlackInput(CoreInput):
1959
2035
  type: Literal['SLACK'] = Field('SLACK', description='Type of the asset or source')
1960
2036
  required: SlackRequired
@@ -2428,7 +2504,7 @@ class ConfluenceOptionalConnection(BaseModel):
2428
2504
  )
2429
2505
 
2430
2506
 
2431
- class Type17(StrEnum):
2507
+ class Type18(StrEnum):
2432
2508
  """
2433
2509
  Filter spaces by space type
2434
2510
  """
@@ -2465,7 +2541,7 @@ class ConfluenceOptionalScopeSpaces(BaseModel):
2465
2541
  keys: list[str] | None = Field(
2466
2542
  None, description='Filter spaces by keys (up to 250)', max_length=250
2467
2543
  )
2468
- type: Type17 | None = Field(None, description='Filter spaces by space type')
2544
+ type: Type18 | None = Field(None, description='Filter spaces by space type')
2469
2545
  status: Status | None = Field(None, description='Filter spaces by status')
2470
2546
  labels: list[str] | None = Field(
2471
2547
  None,
@@ -2731,7 +2807,7 @@ class ServiceDeskOptional(BaseModel):
2731
2807
  content: ServiceDeskOptionalContent | None = None
2732
2808
 
2733
2809
 
2734
- class Type18(StrEnum):
2810
+ class Type19(StrEnum):
2735
2811
  """
2736
2812
  Type of the asset or source
2737
2813
  """
@@ -2758,6 +2834,7 @@ class Type18(StrEnum):
2758
2834
  SQLITE = 'SQLITE'
2759
2835
  NOTION = 'NOTION'
2760
2836
  EMAIL = 'EMAIL'
2837
+ YOUTUBE = 'YOUTUBE'
2761
2838
 
2762
2839
 
2763
2840
  class ConfluenceInput(CoreInput):
@@ -2995,6 +3072,24 @@ class NotionInput(CoreInput):
2995
3072
  resources: ResourceOverrides | None = None
2996
3073
 
2997
3074
 
3075
+ class YouTubeInput(CoreInput):
3076
+ type: Literal['YOUTUBE'] = Field(
3077
+ 'YOUTUBE', description='Type of the asset or source'
3078
+ )
3079
+ required: YouTubeRequired
3080
+ masked: YouTubeMasked | None = None
3081
+ optional: YouTubeOptional | None = None
3082
+ detectors: list[Detector] | None = Field(
3083
+ None, description='Detectors to run on ingested content'
3084
+ )
3085
+ custom_detectors: list[CustomDetectorSelection] | None = Field(
3086
+ None,
3087
+ description='Reusable custom detector IDs selected from the custom detector catalog.',
3088
+ )
3089
+ sampling: SamplingConfig
3090
+ resources: ResourceOverrides | None = None
3091
+
3092
+
2998
3093
  class SourceInput(
2999
3094
  RootModel[
3000
3095
  SlackInput
@@ -3019,6 +3114,7 @@ class SourceInput(
3019
3114
  | SQLiteInput
3020
3115
  | NotionInput
3021
3116
  | EmailInput
3117
+ | YouTubeInput
3022
3118
  ]
3023
3119
  ):
3024
3120
  root: (
@@ -3044,6 +3140,7 @@ class SourceInput(
3044
3140
  | SQLiteInput
3045
3141
  | NotionInput
3046
3142
  | EmailInput
3143
+ | YouTubeInput
3047
3144
  ) = Field(
3048
3145
  ...,
3049
3146
  description='Merged configuration schema with all source types and common definitions',
@@ -214,6 +214,12 @@ class BaseSource(ABC):
214
214
  sampling = getattr(config, "sampling", None) if config is not None else None
215
215
  return bool(getattr(sampling, "enable_ocr", False))
216
216
 
217
+ def transcription_enabled(self) -> bool:
218
+ """Return whether sampling-level audio/video transcription is enabled."""
219
+ config = getattr(self, "config", None)
220
+ sampling = getattr(config, "sampling", None) if config is not None else None
221
+ return bool(getattr(sampling, "enable_transcription", False))
222
+
217
223
  def parse_asset_bytes(
218
224
  self,
219
225
  file_bytes: bytes,
@@ -228,6 +234,7 @@ class BaseSource(ABC):
228
234
  declared_mime_type=declared_mime_type,
229
235
  file_name=file_name,
230
236
  enable_ocr=self.ocr_enabled(),
237
+ enable_transcription=self.transcription_enabled(),
231
238
  )
232
239
 
233
240
  def iter_asset_pages(
@@ -248,6 +255,7 @@ class BaseSource(ABC):
248
255
  include_column_names,
249
256
  file_name=file_name,
250
257
  enable_ocr=self.ocr_enabled(),
258
+ enable_transcription=self.transcription_enabled(),
251
259
  )
252
260
 
253
261
  async def fetch_content_bytes(self, asset_id: str) -> tuple[bytes, str] | None:
@@ -360,14 +360,16 @@ class ObjectStorageSourceBase(BaseSource, ABC):
360
360
  )
361
361
  normalized_mime = mime_type.split(";", 1)[0].strip().lower()
362
362
 
363
- # Non-extractable types (images, audio, video, opaque binary) carry no text.
364
- # Everything else defers extraction to fetch_content_pages() so detectors
365
- # receive content in configurable-sized pages instead of one monolithic blob.
366
- is_non_extractable = normalized_mime.startswith(
367
- ("image/", "audio/", "video/")
368
- ) or normalized_mime in (
369
- "application/octet-stream",
370
- "application/zip",
363
+ # Non-extractable types (images, opaque binary) carry no text. Audio/video
364
+ # are extractable only when transcription is enabled — otherwise they are
365
+ # treated as opaque binary. Everything else defers extraction to
366
+ # fetch_content_pages() so detectors receive content in configurable-sized
367
+ # pages instead of one monolithic blob.
368
+ is_media = normalized_mime.startswith(("audio/", "video/"))
369
+ is_non_extractable = (
370
+ normalized_mime.startswith("image/")
371
+ or (is_media and not self.transcription_enabled())
372
+ or normalized_mime in ("application/octet-stream", "application/zip")
371
373
  )
372
374
 
373
375
  return ContentSnapshot(
@@ -0,0 +1,3 @@
1
+ from .source import YouTubeSource
2
+
3
+ __all__ = ["YouTubeSource"]