classifyre-cli 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. classifyre_cli-0.4.2/.gitignore +65 -0
  2. classifyre_cli-0.4.2/.python-version +1 -0
  3. classifyre_cli-0.4.2/.turbo/turbo-build.log +3 -0
  4. classifyre_cli-0.4.2/PKG-INFO +167 -0
  5. classifyre_cli-0.4.2/README.md +150 -0
  6. classifyre_cli-0.4.2/main.py +4 -0
  7. classifyre_cli-0.4.2/package.json +17 -0
  8. classifyre_cli-0.4.2/pyproject.toml +323 -0
  9. classifyre_cli-0.4.2/scripts/generate_models.py +88 -0
  10. classifyre_cli-0.4.2/src/__init__.py +1 -0
  11. classifyre_cli-0.4.2/src/detectors/__init__.py +105 -0
  12. classifyre_cli-0.4.2/src/detectors/base.py +97 -0
  13. classifyre_cli-0.4.2/src/detectors/broken_links/__init__.py +3 -0
  14. classifyre_cli-0.4.2/src/detectors/broken_links/detector.py +280 -0
  15. classifyre_cli-0.4.2/src/detectors/config.py +59 -0
  16. classifyre_cli-0.4.2/src/detectors/content/__init__.py +0 -0
  17. classifyre_cli-0.4.2/src/detectors/custom/__init__.py +13 -0
  18. classifyre_cli-0.4.2/src/detectors/custom/detector.py +45 -0
  19. classifyre_cli-0.4.2/src/detectors/custom/runners/__init__.py +56 -0
  20. classifyre_cli-0.4.2/src/detectors/custom/runners/_base.py +177 -0
  21. classifyre_cli-0.4.2/src/detectors/custom/runners/_factory.py +51 -0
  22. classifyre_cli-0.4.2/src/detectors/custom/runners/_feature_extraction.py +138 -0
  23. classifyre_cli-0.4.2/src/detectors/custom/runners/_gliner2.py +324 -0
  24. classifyre_cli-0.4.2/src/detectors/custom/runners/_image_classification.py +98 -0
  25. classifyre_cli-0.4.2/src/detectors/custom/runners/_llm.py +22 -0
  26. classifyre_cli-0.4.2/src/detectors/custom/runners/_object_detection.py +107 -0
  27. classifyre_cli-0.4.2/src/detectors/custom/runners/_regex.py +147 -0
  28. classifyre_cli-0.4.2/src/detectors/custom/runners/_text_classification.py +109 -0
  29. classifyre_cli-0.4.2/src/detectors/custom/trainer.py +293 -0
  30. classifyre_cli-0.4.2/src/detectors/dependencies.py +109 -0
  31. classifyre_cli-0.4.2/src/detectors/pii/__init__.py +0 -0
  32. classifyre_cli-0.4.2/src/detectors/pii/detector.py +883 -0
  33. classifyre_cli-0.4.2/src/detectors/secrets/__init__.py +0 -0
  34. classifyre_cli-0.4.2/src/detectors/secrets/detector.py +399 -0
  35. classifyre_cli-0.4.2/src/detectors/threat/__init__.py +0 -0
  36. classifyre_cli-0.4.2/src/detectors/threat/code_security_detector.py +206 -0
  37. classifyre_cli-0.4.2/src/detectors/threat/yara_detector.py +177 -0
  38. classifyre_cli-0.4.2/src/main.py +608 -0
  39. classifyre_cli-0.4.2/src/models/generated_detectors.py +1296 -0
  40. classifyre_cli-0.4.2/src/models/generated_input.py +2732 -0
  41. classifyre_cli-0.4.2/src/models/generated_single_asset_scan_results.py +240 -0
  42. classifyre_cli-0.4.2/src/outputs/__init__.py +3 -0
  43. classifyre_cli-0.4.2/src/outputs/base.py +69 -0
  44. classifyre_cli-0.4.2/src/outputs/console.py +62 -0
  45. classifyre_cli-0.4.2/src/outputs/factory.py +156 -0
  46. classifyre_cli-0.4.2/src/outputs/file.py +83 -0
  47. classifyre_cli-0.4.2/src/outputs/rest.py +258 -0
  48. classifyre_cli-0.4.2/src/pipeline/__init__.py +7 -0
  49. classifyre_cli-0.4.2/src/pipeline/content_provider.py +26 -0
  50. classifyre_cli-0.4.2/src/pipeline/detector_pipeline.py +742 -0
  51. classifyre_cli-0.4.2/src/pipeline/parsed_content_provider.py +59 -0
  52. classifyre_cli-0.4.2/src/sandbox/__init__.py +5 -0
  53. classifyre_cli-0.4.2/src/sandbox/runner.py +145 -0
  54. classifyre_cli-0.4.2/src/sources/__init__.py +95 -0
  55. classifyre_cli-0.4.2/src/sources/atlassian_common.py +389 -0
  56. classifyre_cli-0.4.2/src/sources/azure_blob_storage/__init__.py +3 -0
  57. classifyre_cli-0.4.2/src/sources/azure_blob_storage/source.py +130 -0
  58. classifyre_cli-0.4.2/src/sources/base.py +296 -0
  59. classifyre_cli-0.4.2/src/sources/confluence/__init__.py +3 -0
  60. classifyre_cli-0.4.2/src/sources/confluence/source.py +733 -0
  61. classifyre_cli-0.4.2/src/sources/databricks/__init__.py +3 -0
  62. classifyre_cli-0.4.2/src/sources/databricks/source.py +1279 -0
  63. classifyre_cli-0.4.2/src/sources/dependencies.py +81 -0
  64. classifyre_cli-0.4.2/src/sources/google_cloud_storage/__init__.py +3 -0
  65. classifyre_cli-0.4.2/src/sources/google_cloud_storage/source.py +114 -0
  66. classifyre_cli-0.4.2/src/sources/hive/__init__.py +3 -0
  67. classifyre_cli-0.4.2/src/sources/hive/source.py +709 -0
  68. classifyre_cli-0.4.2/src/sources/jira/__init__.py +3 -0
  69. classifyre_cli-0.4.2/src/sources/jira/source.py +605 -0
  70. classifyre_cli-0.4.2/src/sources/mongodb/__init__.py +3 -0
  71. classifyre_cli-0.4.2/src/sources/mongodb/source.py +550 -0
  72. classifyre_cli-0.4.2/src/sources/mssql/__init__.py +3 -0
  73. classifyre_cli-0.4.2/src/sources/mssql/source.py +1034 -0
  74. classifyre_cli-0.4.2/src/sources/mysql/__init__.py +3 -0
  75. classifyre_cli-0.4.2/src/sources/mysql/source.py +797 -0
  76. classifyre_cli-0.4.2/src/sources/neo4j/__init__.py +0 -0
  77. classifyre_cli-0.4.2/src/sources/neo4j/source.py +523 -0
  78. classifyre_cli-0.4.2/src/sources/object_storage/base.py +679 -0
  79. classifyre_cli-0.4.2/src/sources/oracle/__init__.py +3 -0
  80. classifyre_cli-0.4.2/src/sources/oracle/source.py +982 -0
  81. classifyre_cli-0.4.2/src/sources/postgresql/__init__.py +3 -0
  82. classifyre_cli-0.4.2/src/sources/postgresql/source.py +774 -0
  83. classifyre_cli-0.4.2/src/sources/powerbi/__init__.py +3 -0
  84. classifyre_cli-0.4.2/src/sources/powerbi/source.py +774 -0
  85. classifyre_cli-0.4.2/src/sources/recipe_normalizer.py +179 -0
  86. classifyre_cli-0.4.2/src/sources/s3_compatible_storage/README.md +66 -0
  87. classifyre_cli-0.4.2/src/sources/s3_compatible_storage/__init__.py +3 -0
  88. classifyre_cli-0.4.2/src/sources/s3_compatible_storage/source.py +150 -0
  89. classifyre_cli-0.4.2/src/sources/servicedesk/__init__.py +3 -0
  90. classifyre_cli-0.4.2/src/sources/servicedesk/source.py +620 -0
  91. classifyre_cli-0.4.2/src/sources/slack/__init__.py +3 -0
  92. classifyre_cli-0.4.2/src/sources/slack/source.py +534 -0
  93. classifyre_cli-0.4.2/src/sources/snowflake/__init__.py +3 -0
  94. classifyre_cli-0.4.2/src/sources/snowflake/source.py +912 -0
  95. classifyre_cli-0.4.2/src/sources/tableau/__init__.py +3 -0
  96. classifyre_cli-0.4.2/src/sources/tableau/source.py +799 -0
  97. classifyre_cli-0.4.2/src/sources/tabular_utils.py +165 -0
  98. classifyre_cli-0.4.2/src/sources/wordpress/__init__.py +3 -0
  99. classifyre_cli-0.4.2/src/sources/wordpress/source.py +590 -0
  100. classifyre_cli-0.4.2/src/telemetry.py +96 -0
  101. classifyre_cli-0.4.2/src/utils/__init__.py +1 -0
  102. classifyre_cli-0.4.2/src/utils/content_extraction.py +108 -0
  103. classifyre_cli-0.4.2/src/utils/file_parser.py +777 -0
  104. classifyre_cli-0.4.2/src/utils/hashing.py +82 -0
  105. classifyre_cli-0.4.2/src/utils/uv_sync.py +79 -0
  106. classifyre_cli-0.4.2/src/utils/validation.py +56 -0
  107. classifyre_cli-0.4.2/tests/__init__.py +0 -0
  108. classifyre_cli-0.4.2/tests/conftest.py +21 -0
  109. classifyre_cli-0.4.2/tests/detectors/__init__.py +0 -0
  110. classifyre_cli-0.4.2/tests/detectors/broken_links/test_broken_links_detector.py +100 -0
  111. classifyre_cli-0.4.2/tests/detectors/conftest.py +173 -0
  112. classifyre_cli-0.4.2/tests/detectors/content/__init__.py +0 -0
  113. classifyre_cli-0.4.2/tests/detectors/custom/__init__.py +0 -0
  114. classifyre_cli-0.4.2/tests/detectors/custom/conftest.py +21 -0
  115. classifyre_cli-0.4.2/tests/detectors/custom/test_invoice_extraction.py +159 -0
  116. classifyre_cli-0.4.2/tests/detectors/custom/test_pipeline_integration.py +220 -0
  117. classifyre_cli-0.4.2/tests/detectors/custom/test_regex_runner.py +367 -0
  118. classifyre_cli-0.4.2/tests/detectors/custom/test_transformer_runners.py +329 -0
  119. classifyre_cli-0.4.2/tests/detectors/pii/__init__.py +0 -0
  120. classifyre_cli-0.4.2/tests/detectors/pii/conftest.py +19 -0
  121. classifyre_cli-0.4.2/tests/detectors/pii/sample_invoice.pdf +0 -0
  122. classifyre_cli-0.4.2/tests/detectors/pii/test_pii_detector.py +511 -0
  123. classifyre_cli-0.4.2/tests/detectors/pii/test_pii_detector_extended.py +177 -0
  124. classifyre_cli-0.4.2/tests/detectors/secrets/__init__.py +0 -0
  125. classifyre_cli-0.4.2/tests/detectors/secrets/test_secrets_detector.py +267 -0
  126. classifyre_cli-0.4.2/tests/detectors/secrets/test_secrets_detector_extended.py +213 -0
  127. classifyre_cli-0.4.2/tests/detectors/test_base_detector.py +147 -0
  128. classifyre_cli-0.4.2/tests/detectors/test_custom_detector_examples_runtime.py +157 -0
  129. classifyre_cli-0.4.2/tests/detectors/test_detector_catalog_commercial.py +72 -0
  130. classifyre_cli-0.4.2/tests/detectors/test_detector_pipeline_types.py +344 -0
  131. classifyre_cli-0.4.2/tests/detectors/test_detector_schema_examples.py +134 -0
  132. classifyre_cli-0.4.2/tests/detectors/test_detector_types.py +253 -0
  133. classifyre_cli-0.4.2/tests/detectors/test_phase2_detectors.py +1 -0
  134. classifyre_cli-0.4.2/tests/detectors/test_registry.py +40 -0
  135. classifyre_cli-0.4.2/tests/detectors/threat/__init__.py +0 -0
  136. classifyre_cli-0.4.2/tests/detectors/threat/test_code_security_detector.py +178 -0
  137. classifyre_cli-0.4.2/tests/detectors/threat/test_yara_detector.py +332 -0
  138. classifyre_cli-0.4.2/tests/integration/test_wordpress_broken_links_detector.py +122 -0
  139. classifyre_cli-0.4.2/tests/integration/test_wordpress_links_assets.py +101 -0
  140. classifyre_cli-0.4.2/tests/pipeline/test_detector_pipeline.py +657 -0
  141. classifyre_cli-0.4.2/tests/test_azure_blob_storage_source.py +83 -0
  142. classifyre_cli-0.4.2/tests/test_base_source_attachment.py +102 -0
  143. classifyre_cli-0.4.2/tests/test_base_source_sampling.py +48 -0
  144. classifyre_cli-0.4.2/tests/test_confluence_source.py +314 -0
  145. classifyre_cli-0.4.2/tests/test_databricks_source.py +417 -0
  146. classifyre_cli-0.4.2/tests/test_google_cloud_storage_source.py +74 -0
  147. classifyre_cli-0.4.2/tests/test_hashing.py +108 -0
  148. classifyre_cli-0.4.2/tests/test_hive_source.py +316 -0
  149. classifyre_cli-0.4.2/tests/test_jira_source.py +401 -0
  150. classifyre_cli-0.4.2/tests/test_mongodb_source.py +347 -0
  151. classifyre_cli-0.4.2/tests/test_mssql_source.py +429 -0
  152. classifyre_cli-0.4.2/tests/test_mysql_source.py +362 -0
  153. classifyre_cli-0.4.2/tests/test_neo4j_source.py +395 -0
  154. classifyre_cli-0.4.2/tests/test_oracle_source.py +334 -0
  155. classifyre_cli-0.4.2/tests/test_outputs.py +335 -0
  156. classifyre_cli-0.4.2/tests/test_postgresql_source.py +519 -0
  157. classifyre_cli-0.4.2/tests/test_powerbi_source.py +361 -0
  158. classifyre_cli-0.4.2/tests/test_recipe_normalizer.py +53 -0
  159. classifyre_cli-0.4.2/tests/test_s3_compatible_storage_source.py +213 -0
  160. classifyre_cli-0.4.2/tests/test_servicedesk_source.py +309 -0
  161. classifyre_cli-0.4.2/tests/test_slack_source.py +208 -0
  162. classifyre_cli-0.4.2/tests/test_snowflake_source.py +329 -0
  163. classifyre_cli-0.4.2/tests/test_source_dependency_groups.py +74 -0
  164. classifyre_cli-0.4.2/tests/test_tableau_source.py +361 -0
  165. classifyre_cli-0.4.2/tests/test_tabular_utils.py +156 -0
  166. classifyre_cli-0.4.2/tests/test_wordpress_source.py +287 -0
  167. classifyre_cli-0.4.2/tests/utils/test_content_extraction.py +150 -0
  168. classifyre_cli-0.4.2/tests/utils/test_file_parser.py +474 -0
  169. classifyre_cli-0.4.2/uv.lock +5560 -0
@@ -0,0 +1,65 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+
8
+ # Virtual environments
9
+ .venv/
10
+ venv/
11
+ ENV/
12
+ env/
13
+
14
+ # Distribution / packaging
15
+ .Python
16
+ build/
17
+ develop-eggs/
18
+ dist/
19
+ downloads/
20
+ eggs/
21
+ .eggs/
22
+ lib/
23
+ lib64/
24
+ parts/
25
+ sdist/
26
+ var/
27
+ wheels/
28
+ *.egg-info/
29
+ .installed.cfg
30
+ *.egg
31
+
32
+ # PyInstaller
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Unit test / coverage reports
37
+ htmlcov/
38
+ .tox/
39
+ .coverage
40
+ .coverage.*
41
+ .cache
42
+ nosetests.xml
43
+ coverage.xml
44
+ *.cover
45
+ .hypothesis/
46
+ .pytest_cache/
47
+
48
+ # mypy
49
+ .mypy_cache/
50
+ .dmypy.json
51
+ dmypy.json
52
+
53
+ # ruff
54
+ .ruff_cache/
55
+
56
+ # IDEs
57
+ .vscode/
58
+ .idea/
59
+ *.swp
60
+ *.swo
61
+ *~
62
+ .DS_Store
63
+
64
+ # Local training artifacts
65
+ checkpoints/
@@ -0,0 +1 @@
1
+ 3.12
@@ -0,0 +1,3 @@
1
+ $ uv sync
2
+ Resolved 256 packages in 271ms
3
+ Checked 49 packages in 2ms
@@ -0,0 +1,167 @@
1
+ Metadata-Version: 2.4
2
+ Name: classifyre-cli
3
+ Version: 0.4.2
4
+ Summary: Classifyre CLI — scan and classify unstructured data sources
5
+ License: MIT
6
+ Keywords: data,ingestion,metadata,pii,secrets,unstructured
7
+ Requires-Python: >=3.12
8
+ Requires-Dist: beautifulsoup4>=4.12.0
9
+ Requires-Dist: classifyre-schemas
10
+ Requires-Dist: email-validator>=2.3.0
11
+ Requires-Dist: en-core-web-sm
12
+ Requires-Dist: jsonschema>=4.26.0
13
+ Requires-Dist: lxml>=6.1.1
14
+ Requires-Dist: pydantic>=2.13.4
15
+ Requires-Dist: requests>=2.34.2
16
+ Description-Content-Type: text/markdown
17
+
18
+ # CLI Application
19
+
20
+ Python CLI for source extraction, detector execution, and batched output delivery.
21
+
22
+ ## Setup
23
+
24
+ ```bash
25
+ cd /unstructured/apps/cli
26
+ uv sync
27
+ # Optional if you want an activated shell instead of `uv run ...`:
28
+ source .venv/bin/activate
29
+ ```
30
+
31
+ Optional detector groups:
32
+
33
+ ```bash
34
+ uv sync --group detectors
35
+ # or specific groups: --group secrets --group pii --group threat ...
36
+ ```
37
+
38
+ ## Command Syntax
39
+
40
+ Use the thin wrapper:
41
+
42
+ ```bash
43
+ uv run main.py <command> <recipe.json> [options]
44
+ ```
45
+
46
+ Or direct module entrypoint:
47
+
48
+ ```bash
49
+ uv run python -m src.main <command> <recipe.json> [options]
50
+ ```
51
+
52
+ Commands:
53
+
54
+ - `test` - test source connection.
55
+ - `discover` - discover source resources.
56
+ - `extract` - run extraction and emit batched output.
57
+ - `sandbox` - run sandbox parsing/detectors for a local file.
58
+
59
+ ## Extract Output Model
60
+
61
+ Extraction always emits in batches.
62
+ Recipes do not contain `output` configuration; output is controlled by CLI flags and environment variables.
63
+
64
+ Output types:
65
+
66
+ - `console` - emits NDJSON envelopes to stdout.
67
+ - `file` - appends NDJSON envelopes to a file.
68
+ - `rest` - pushes batches to API endpoints and finalizes run.
69
+
70
+ Default behavior:
71
+
72
+ - If `source_id` is present (`--source-id` or `SOURCE_ID` env), default output is `rest`.
73
+ - Otherwise default output is `console`.
74
+ - Default batch size is `20`.
75
+
76
+ ## CLI Options
77
+
78
+ Global/common:
79
+
80
+ - `--debug` - enable debug logging.
81
+ - `--detectors-file <path>` - sandbox only.
82
+
83
+ Extract output options:
84
+
85
+ - `--output-type rest|file|console`
86
+ - `--output-batch-size <int>`
87
+ - `--output-rest-url <url>`
88
+ - `--output-file-path <path>`
89
+ - `--source-id <uuid>`
90
+ - `--runner-id <uuid>`
91
+ - `--managed-runner` (REST only; runner lifecycle managed by API orchestrator)
92
+
93
+ Environment fallbacks:
94
+
95
+ - `SOURCE_ID`, `RUNNER_ID`
96
+ - `CLASSIFYRE_OUTPUT_TYPE`, `CLASSIFYRE_OUTPUT_BATCH_SIZE`
97
+ - `CLASSIFYRE_OUTPUT_REST_URL`, `CLASSIFYRE_OUTPUT_REST_TIMEOUT_SEC`
98
+ - `CLASSIFYRE_OUTPUT_FILE_PATH`
99
+ - `API_URL` (fallback base URL for REST output)
100
+
101
+ ## Practical Examples
102
+
103
+ ### 1) Console output (quick local test)
104
+
105
+ ```bash
106
+ uv run main.py extract ./wordpress-recipe.json --output-type console --output-batch-size 1
107
+ ```
108
+
109
+ You will see NDJSON lines like:
110
+
111
+ - `{"event":"batch", ...}`
112
+ - `{"event":"finish", ...}`
113
+
114
+ ### 2) File output
115
+
116
+ ```bash
117
+ uv run main.py extract ./wordpress-recipe.json \
118
+ --output-type file \
119
+ --output-file-path /tmp/classifyre-assets.ndjson \
120
+ --output-batch-size 20
121
+ ```
122
+
123
+ ### 3) REST output (manual CLI to backend)
124
+
125
+ ```bash
126
+ uv run main.py extract ./wordpress-recipe.json \
127
+ --output-type rest \
128
+ --source-id <source_uuid>
129
+ ```
130
+
131
+ Notes:
132
+
133
+ - `--runner-id` is optional for manual runs. If omitted, the CLI creates an external runner automatically.
134
+ - `--output-rest-url` is optional. If omitted, the CLI uses `CLASSIFYRE_OUTPUT_REST_URL`, then `API_URL`, then `http://localhost:8000`.
135
+ - `--managed-runner` should be used only for API-orchestrated runs where the runner already exists.
136
+
137
+ ### 4) REST output with explicit runner (managed/orchestrated style)
138
+
139
+ ```bash
140
+ uv run main.py extract ./wordpress-recipe.json \
141
+ --output-type rest \
142
+ --source-id <source_uuid> \
143
+ --runner-id <runner_uuid> \
144
+ --managed-runner
145
+ ```
146
+
147
+ ### 5) Full extract command with all output flags
148
+
149
+ ```bash
150
+ uv run main.py extract ./wordpress-recipe.json \
151
+ --output-type rest \
152
+ --output-batch-size 20 \
153
+ --output-rest-url http://localhost:8000 \
154
+ --output-file-path /tmp/classifyre-assets.ndjson \
155
+ --source-id <source_uuid> \
156
+ --runner-id <runner_uuid> \
157
+ --managed-runner
158
+ ```
159
+
160
+ Use `--output-file-path` only when `--output-type` is `file`.
161
+
162
+ ## Dev Scripts
163
+
164
+ - `bun run dev` - run CLI quickly.
165
+ - `bun run lint` - ruff format/check.
166
+ - `bun run check-types` - mypy.
167
+ - `bun run test` - pytest suite.
@@ -0,0 +1,150 @@
1
+ # CLI Application
2
+
3
+ Python CLI for source extraction, detector execution, and batched output delivery.
4
+
5
+ ## Setup
6
+
7
+ ```bash
8
+ cd /unstructured/apps/cli
9
+ uv sync
10
+ # Optional if you want an activated shell instead of `uv run ...`:
11
+ source .venv/bin/activate
12
+ ```
13
+
14
+ Optional detector groups:
15
+
16
+ ```bash
17
+ uv sync --group detectors
18
+ # or specific groups: --group secrets --group pii --group threat ...
19
+ ```
20
+
21
+ ## Command Syntax
22
+
23
+ Use the thin wrapper:
24
+
25
+ ```bash
26
+ uv run main.py <command> <recipe.json> [options]
27
+ ```
28
+
29
+ Or direct module entrypoint:
30
+
31
+ ```bash
32
+ uv run python -m src.main <command> <recipe.json> [options]
33
+ ```
34
+
35
+ Commands:
36
+
37
+ - `test` - test source connection.
38
+ - `discover` - discover source resources.
39
+ - `extract` - run extraction and emit batched output.
40
+ - `sandbox` - run sandbox parsing/detectors for a local file.
41
+
42
+ ## Extract Output Model
43
+
44
+ Extraction always emits in batches.
45
+ Recipes do not contain `output` configuration; output is controlled by CLI flags and environment variables.
46
+
47
+ Output types:
48
+
49
+ - `console` - emits NDJSON envelopes to stdout.
50
+ - `file` - appends NDJSON envelopes to a file.
51
+ - `rest` - pushes batches to API endpoints and finalizes run.
52
+
53
+ Default behavior:
54
+
55
+ - If `source_id` is present (`--source-id` or `SOURCE_ID` env), default output is `rest`.
56
+ - Otherwise default output is `console`.
57
+ - Default batch size is `20`.
58
+
59
+ ## CLI Options
60
+
61
+ Global/common:
62
+
63
+ - `--debug` - enable debug logging.
64
+ - `--detectors-file <path>` - sandbox only.
65
+
66
+ Extract output options:
67
+
68
+ - `--output-type rest|file|console`
69
+ - `--output-batch-size <int>`
70
+ - `--output-rest-url <url>`
71
+ - `--output-file-path <path>`
72
+ - `--source-id <uuid>`
73
+ - `--runner-id <uuid>`
74
+ - `--managed-runner` (REST only; runner lifecycle managed by API orchestrator)
75
+
76
+ Environment fallbacks:
77
+
78
+ - `SOURCE_ID`, `RUNNER_ID`
79
+ - `CLASSIFYRE_OUTPUT_TYPE`, `CLASSIFYRE_OUTPUT_BATCH_SIZE`
80
+ - `CLASSIFYRE_OUTPUT_REST_URL`, `CLASSIFYRE_OUTPUT_REST_TIMEOUT_SEC`
81
+ - `CLASSIFYRE_OUTPUT_FILE_PATH`
82
+ - `API_URL` (fallback base URL for REST output)
83
+
84
+ ## Practical Examples
85
+
86
+ ### 1) Console output (quick local test)
87
+
88
+ ```bash
89
+ uv run main.py extract ./wordpress-recipe.json --output-type console --output-batch-size 1
90
+ ```
91
+
92
+ You will see NDJSON lines like:
93
+
94
+ - `{"event":"batch", ...}`
95
+ - `{"event":"finish", ...}`
96
+
97
+ ### 2) File output
98
+
99
+ ```bash
100
+ uv run main.py extract ./wordpress-recipe.json \
101
+ --output-type file \
102
+ --output-file-path /tmp/classifyre-assets.ndjson \
103
+ --output-batch-size 20
104
+ ```
105
+
106
+ ### 3) REST output (manual CLI to backend)
107
+
108
+ ```bash
109
+ uv run main.py extract ./wordpress-recipe.json \
110
+ --output-type rest \
111
+ --source-id <source_uuid>
112
+ ```
113
+
114
+ Notes:
115
+
116
+ - `--runner-id` is optional for manual runs. If omitted, the CLI creates an external runner automatically.
117
+ - `--output-rest-url` is optional. If omitted, the CLI uses `CLASSIFYRE_OUTPUT_REST_URL`, then `API_URL`, then `http://localhost:8000`.
118
+ - `--managed-runner` should be used only for API-orchestrated runs where the runner already exists.
119
+
120
+ ### 4) REST output with explicit runner (managed/orchestrated style)
121
+
122
+ ```bash
123
+ uv run main.py extract ./wordpress-recipe.json \
124
+ --output-type rest \
125
+ --source-id <source_uuid> \
126
+ --runner-id <runner_uuid> \
127
+ --managed-runner
128
+ ```
129
+
130
+ ### 5) Full extract command with all output flags
131
+
132
+ ```bash
133
+ uv run main.py extract ./wordpress-recipe.json \
134
+ --output-type rest \
135
+ --output-batch-size 20 \
136
+ --output-rest-url http://localhost:8000 \
137
+ --output-file-path /tmp/classifyre-assets.ndjson \
138
+ --source-id <source_uuid> \
139
+ --runner-id <runner_uuid> \
140
+ --managed-runner
141
+ ```
142
+
143
+ Use `--output-file-path` only when `--output-type` is `file`.
144
+
145
+ ## Dev Scripts
146
+
147
+ - `bun run dev` - run CLI quickly.
148
+ - `bun run lint` - ruff format/check.
149
+ - `bun run check-types` - mypy.
150
+ - `bun run test` - pytest suite.
@@ -0,0 +1,4 @@
1
+ from src.main import main
2
+
3
+ if __name__ == "__main__":
4
+ main()
@@ -0,0 +1,17 @@
1
+ {
2
+ "name": "@classifyre/cli",
3
+ "version": "0.4.2",
4
+ "private": true,
5
+ "scripts": {
6
+ "build": "uv sync",
7
+ "dev": "uv run main.py",
8
+ "lint": "uv run ruff check . --fix && uv run ruff format .",
9
+ "check-types": "uv run mypy .",
10
+ "test": "uv sync --group dev --group file-processing && uv run pytest",
11
+ "test:integration": "uv run pytest tests/integration/ -m integration",
12
+ "test:integration:run": "RUN_INTEGRATION_TESTS=1 uv run pytest tests/integration/ -m integration",
13
+ "e2e": "bun run test:e2e",
14
+ "test:e2e": "if rg -q '@pytest\\.mark\\.e2e' tests; then RUN_E2E_TESTS=1 uv run pytest -m e2e; else echo 'No CLI e2e tests collected; treating as pass'; fi",
15
+ "codegen": "uv run --python 3.12 --group dev --locked python scripts/generate_models.py"
16
+ }
17
+ }