detect-file-type-local 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 detect-file-type-skill contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,12 @@
1
+ detect-file-type-skill
2
+ Copyright (c) 2026 detect-file-type-skill contributors
3
+
4
+ This product includes software developed by Google LLC:
5
+
6
+ Magika - AI-powered file type detection
7
+ Copyright 2024 Google LLC
8
+ Licensed under the Apache License, Version 2.0
9
+ https://github.com/google/magika
10
+
11
+ You may obtain a copy of the Apache License, Version 2.0 at:
12
+ http://www.apache.org/licenses/LICENSE-2.0
@@ -0,0 +1,110 @@
1
+ Metadata-Version: 2.4
2
+ Name: detect-file-type-local
3
+ Version: 0.1.0
4
+ Summary: Security-focused local file type detection powered by Google Magika
5
+ License-Expression: MIT
6
+ Requires-Python: >=3.8
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ License-File: NOTICE
10
+ Requires-Dist: magika<2.0.0,>=1.0.0
11
+ Provides-Extra: dev
12
+ Requires-Dist: pytest>=7.0; extra == "dev"
13
+ Requires-Dist: ruff>=0.4; extra == "dev"
14
+ Dynamic: license-file
15
+
16
+ # detect-file-type-local
17
+
18
+ [![CI](https://github.com/pgeraghty/openclaw-detect-file-type-local/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/pgeraghty/openclaw-detect-file-type-local/actions/workflows/ci.yml)
19
+ [![License: MIT](https://img.shields.io/github/license/pgeraghty/openclaw-detect-file-type-local)](LICENSE)
20
+ ![Python 3.8+](https://img.shields.io/badge/python-3.8%2B-blue)
21
+ ![Inference: Local/Offline](https://img.shields.io/badge/inference-local%20%7C%20offline-success)
22
+ ![API Keys](https://img.shields.io/badge/api_keys-none-success)
23
+
24
+ An [OpenClaw](https://openclaw.org) skill for AI-powered local file type detection.
25
+
26
+ Wraps [Google Magika](https://github.com/google/magika) to provide ML-based file type identification that runs entirely offline. No API keys, no network calls — just local inference on an embedded ONNX model.
27
+
28
+ ## Features
29
+
30
+ - **214 file types** detected by content, not extension
31
+ - **Fully offline** — no network access required
32
+ - **Fast** — only reads the bytes needed for classification
33
+ - **Batch support** — process multiple files or entire directories
34
+ - **Multiple output formats** — JSON, human-readable, bare MIME type
35
+ - **Stdin support** — pipe content directly
36
+
37
+ ## Quick Start
38
+
39
+ ```bash
40
+ pip install -e .
41
+
42
+ # Detect a single file
43
+ detect-file-type-local document.pdf
44
+
45
+ # Batch detect
46
+ detect-file-type-local --human *.pdf *.png
47
+
48
+ # Recursive directory scan
49
+ detect-file-type-local -r ./uploads/
50
+
51
+ # Pipe from stdin
52
+ cat mystery_file | detect-file-type-local -
53
+ ```
54
+
55
+ Compatibility alias: `detect-file-type` remains available.
56
+
57
+ ## Output Formats
58
+
59
+ **JSON (default):**
60
+ ```json
61
+ {
62
+ "path": "photo.jpg",
63
+ "label": "jpeg",
64
+ "mime_type": "image/jpeg",
65
+ "score": 0.99,
66
+ "group": "image",
67
+ "description": "JPEG image",
68
+ "is_text": false
69
+ }
70
+ ```
71
+
72
+ **Human-readable:**
73
+ ```
74
+ photo.jpg: JPEG image (image/jpeg) [score: 0.99]
75
+ ```
76
+
77
+ **MIME-only:**
78
+ ```
79
+ image/jpeg
80
+ ```
81
+
82
+ ## OpenClaw Skill
83
+
84
+ See [SKILL.md](SKILL.md) for the OpenClaw skill definition, including structured output schemas and usage guidance for LLM integration.
85
+
86
+ Note: this skill currently uses manual local installation (`pip install -e ...`). Auto-install metadata will be added after a public package artifact is published and resolvable.
87
+
88
+ ## Development
89
+
90
+ ```bash
91
+ pip install -e '.[dev]'
92
+ pytest tests/ -v
93
+ ruff check .
94
+ ```
95
+
96
+ ## Release
97
+
98
+ PyPI publishing is automated via GitHub Actions (`Publish Python Package` workflow):
99
+
100
+ 1. Create a GitHub release with a tag matching package version (for example, `v0.1.0`)
101
+ 2. Workflow builds and validates artifacts
102
+ 3. Workflow publishes to PyPI via trusted publishing
103
+
104
+ After PyPI release, update and republish the ClawHub skill metadata to enable auto-install from `detect-file-type-local`.
105
+
106
+ ## License
107
+
108
+ MIT — see [LICENSE](LICENSE).
109
+
110
+ This project uses [Google Magika](https://github.com/google/magika) (Apache-2.0). See [NOTICE](NOTICE) and [THIRD_PARTY_LICENSES.md](THIRD_PARTY_LICENSES.md).
@@ -0,0 +1,95 @@
1
+ # detect-file-type-local
2
+
3
+ [![CI](https://github.com/pgeraghty/openclaw-detect-file-type-local/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/pgeraghty/openclaw-detect-file-type-local/actions/workflows/ci.yml)
4
+ [![License: MIT](https://img.shields.io/github/license/pgeraghty/openclaw-detect-file-type-local)](LICENSE)
5
+ ![Python 3.8+](https://img.shields.io/badge/python-3.8%2B-blue)
6
+ ![Inference: Local/Offline](https://img.shields.io/badge/inference-local%20%7C%20offline-success)
7
+ ![API Keys](https://img.shields.io/badge/api_keys-none-success)
8
+
9
+ An [OpenClaw](https://openclaw.org) skill for AI-powered local file type detection.
10
+
11
+ Wraps [Google Magika](https://github.com/google/magika) to provide ML-based file type identification that runs entirely offline. No API keys, no network calls — just local inference on an embedded ONNX model.
12
+
13
+ ## Features
14
+
15
+ - **214 file types** detected by content, not extension
16
+ - **Fully offline** — no network access required
17
+ - **Fast** — only reads the bytes needed for classification
18
+ - **Batch support** — process multiple files or entire directories
19
+ - **Multiple output formats** — JSON, human-readable, bare MIME type
20
+ - **Stdin support** — pipe content directly
21
+
22
+ ## Quick Start
23
+
24
+ ```bash
25
+ pip install -e .
26
+
27
+ # Detect a single file
28
+ detect-file-type-local document.pdf
29
+
30
+ # Batch detect
31
+ detect-file-type-local --human *.pdf *.png
32
+
33
+ # Recursive directory scan
34
+ detect-file-type-local -r ./uploads/
35
+
36
+ # Pipe from stdin
37
+ cat mystery_file | detect-file-type-local -
38
+ ```
39
+
40
+ Compatibility alias: `detect-file-type` remains available.
41
+
42
+ ## Output Formats
43
+
44
+ **JSON (default):**
45
+ ```json
46
+ {
47
+ "path": "photo.jpg",
48
+ "label": "jpeg",
49
+ "mime_type": "image/jpeg",
50
+ "score": 0.99,
51
+ "group": "image",
52
+ "description": "JPEG image",
53
+ "is_text": false
54
+ }
55
+ ```
56
+
57
+ **Human-readable:**
58
+ ```
59
+ photo.jpg: JPEG image (image/jpeg) [score: 0.99]
60
+ ```
61
+
62
+ **MIME-only:**
63
+ ```
64
+ image/jpeg
65
+ ```
66
+
67
+ ## OpenClaw Skill
68
+
69
+ See [SKILL.md](SKILL.md) for the OpenClaw skill definition, including structured output schemas and usage guidance for LLM integration.
70
+
71
+ Note: this skill currently uses manual local installation (`pip install -e ...`). Auto-install metadata will be added after a public package artifact is published and resolvable.
72
+
73
+ ## Development
74
+
75
+ ```bash
76
+ pip install -e '.[dev]'
77
+ pytest tests/ -v
78
+ ruff check .
79
+ ```
80
+
81
+ ## Release
82
+
83
+ PyPI publishing is automated via GitHub Actions (`Publish Python Package` workflow):
84
+
85
+ 1. Create a GitHub release with a tag matching the package version (for example, `v0.1.0`)
86
+ 2. Workflow builds and validates artifacts
87
+ 3. Workflow publishes to PyPI via trusted publishing
88
+
89
+ After PyPI release, update and republish the ClawHub skill metadata to enable auto-install from `detect-file-type-local`.
90
+
91
+ ## License
92
+
93
+ MIT — see [LICENSE](LICENSE).
94
+
95
+ This project uses [Google Magika](https://github.com/google/magika) (Apache-2.0). See [NOTICE](NOTICE) and [THIRD_PARTY_LICENSES.md](THIRD_PARTY_LICENSES.md).
@@ -0,0 +1,3 @@
1
+ """detect-file-type: AI-powered local file type detection via Google Magika."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,5 @@
1
+ """Allow running as `python -m detect_file_type`."""
2
+
3
+ from detect_file_type.cli import main
4
+
5
+ main()
@@ -0,0 +1,147 @@
1
+ """CLI entry point for detect-file-type."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import os
7
+ import sys
8
+ from pathlib import Path
9
+ from typing import List
10
+
11
+ from magika import Magika
12
+
13
+ from detect_file_type.formatter import (
14
+ format_human,
15
+ format_json,
16
+ format_mime,
17
+ result_to_dict,
18
+ )
19
+
20
+ STDIN_MAX_BYTES = 1_048_576 # 1 MB
21
+
22
+
23
def collect_paths(args_paths: List[str], recursive: bool) -> List[str]:
    """Expand directory arguments into the files they contain.

    Args:
        args_paths: Paths as given on the command line; ``"-"`` denotes stdin.
        recursive: When true, every argument that is a directory is replaced
            by all files found beneath it.

    Returns:
        The paths to process, in argument order. The stdin marker,
        non-directory arguments and (when ``recursive`` is false) directory
        arguments are passed through unchanged. Files of an expanded directory
        are listed parent-first, with files and sub-directories each visited
        in sorted name order.
    """
    expanded: List[str] = []
    for p in args_paths:
        # "-" is the stdin marker and must never be probed on the filesystem.
        if p == "-":
            expanded.append("-")
            continue
        path = Path(p)
        if recursive and path.is_dir():
            for root, dirs, files in os.walk(path):
                # os.walk yields sub-directories in arbitrary filesystem order;
                # sorting the list in place makes the traversal (and therefore
                # the output order) deterministic across platforms.
                dirs.sort()
                expanded.extend(os.path.join(root, f) for f in sorted(files))
        else:
            expanded.append(p)
    return expanded
38
+
39
+
40
def detect_files(magika_instance: Magika, paths: List[str]) -> tuple:
    """Detect file types. Returns (results_list, had_errors).

    Args:
        magika_instance: Detector providing ``identify_bytes`` and
            ``identify_paths``.
        paths: Inputs to classify; ``"-"`` means "read from stdin" and may
            appear at most once.

    Returns:
        A 2-tuple ``(results_list, had_errors)``. ``results_list`` holds one
        dict per successfully classified input, ordered by the input's
        position in ``paths``. ``had_errors`` is true when any input was
        rejected or failed; each failure is reported on stderr.
    """
    results = []
    had_errors = False

    # Separate stdin from file paths while preserving original indices
    file_entries = [(i, p) for i, p in enumerate(paths) if p != "-"]
    stdin_indices = [i for i, p in enumerate(paths) if p == "-"]

    # Handle stdin (single stream only)
    if len(stdin_indices) > 1:
        print("error: multiple stdin inputs are not supported; use '-' only once", file=sys.stderr)
        had_errors = True
    elif stdin_indices:
        idx = stdin_indices[0]
        try:
            # Reads at most STDIN_MAX_BYTES; anything beyond that is ignored.
            data = sys.stdin.buffer.read(STDIN_MAX_BYTES)
            result = magika_instance.identify_bytes(data)
            results.append((idx, result_to_dict("-", result)))
        except Exception as e:
            print(f"error: stdin: {e}", file=sys.stderr)
            had_errors = True

    # Validate file paths up front so one bad path cannot fail the whole batch
    valid_file_entries = []
    path_objects = []
    for idx, p in file_entries:
        pp = Path(p)
        if not pp.exists():
            print(f"error: {p}: No such file or directory", file=sys.stderr)
            had_errors = True
            continue
        if not pp.is_file():
            print(f"error: {p}: Not a regular file", file=sys.stderr)
            had_errors = True
            continue
        try:
            # Check readability
            with open(pp, "rb"):
                pass
        except PermissionError:
            print(f"error: {p}: Permission denied", file=sys.stderr)
            had_errors = True
            continue
        except OSError as e:
            # Any other open failure (e.g. the file vanished after the checks
            # above) is reported like the rest instead of crashing the CLI.
            print(f"error: {p}: {e.strerror or e}", file=sys.stderr)
            had_errors = True
            continue
        valid_file_entries.append((idx, p))
        path_objects.append(pp)

    if path_objects:
        try:
            magika_results = list(magika_instance.identify_paths(path_objects))
        except Exception as e:
            print(f"error: detection failed: {e}", file=sys.stderr)
            had_errors = True
        else:
            for (idx, p_str), result in zip(valid_file_entries, magika_results):
                # Convert each result on its own so a single unusable result
                # does not discard the results that follow it.
                try:
                    results.append((idx, result_to_dict(p_str, result)))
                except Exception as e:
                    print(f"error: {p_str}: {e}", file=sys.stderr)
                    had_errors = True

    # Sort by original order
    results.sort(key=lambda x: x[0])
    return [r[1] for r in results], had_errors
100
+
101
+
102
def main(argv: List[str] | None = None) -> None:
    """Parse command-line arguments, run detection and print the results.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use ``sys.argv``.

    Exits with status 1 when there is nothing to process or no input could be
    classified, and with status 2 when results were printed but at least one
    input failed.
    """
    parser = argparse.ArgumentParser(
        prog="detect-file-type",
        description="AI-powered local file type detection using Google Magika",
    )
    parser.add_argument("paths", nargs="+", help="File paths to detect (use - for stdin)")
    # The three output switches share one destination; the last one given wins.
    format_flags = (
        ("--json", "json", "JSON output (default)"),
        ("--human", "human", "Human-readable output"),
        ("--mime", "mime", "Bare MIME type output"),
    )
    for flag, format_name, help_text in format_flags:
        parser.add_argument(
            flag, dest="format", action="store_const", const=format_name, help=help_text
        )
    parser.add_argument("--recursive", "-r", action="store_true", help="Recurse into directories")
    parser.set_defaults(format="json")

    options = parser.parse_args(argv)

    targets = collect_paths(options.paths, options.recursive)
    if not targets:
        print("error: no files to process", file=sys.stderr)
        sys.exit(1)

    detector = Magika()
    results, had_errors = detect_files(detector, targets)
    if not results:
        sys.exit(1)

    renderers = {"json": format_json, "human": format_human, "mime": format_mime}
    render = renderers.get(options.format)
    if render is not None:
        print(render(results))

    if had_errors:
        sys.exit(2)
144
+
145
+
146
+ if __name__ == "__main__":
147
+ main()
@@ -0,0 +1,42 @@
1
+ """Output formatting for file type detection results."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from typing import Any, Dict, List
7
+
8
+
9
def result_to_dict(path: str, result: Any) -> Dict[str, Any]:
    """Flatten a detection result into a plain dict.

    Args:
        path: Label stored under the ``"path"`` key.
        result: Object exposing ``score`` and an ``output`` attribute with
            ``label``, ``mime_type``, ``group``, ``description`` and ``is_text``.

    Returns:
        A dict with the keys ``path``, ``label``, ``mime_type``, ``score``
        (rounded to four decimal places), ``group``, ``description`` and
        ``is_text``, in that order.
    """
    detected = result.output
    record = dict(
        path=path,
        label=detected.label,
        mime_type=detected.mime_type,
        score=round(result.score, 4),
        group=detected.group,
        description=detected.description,
        is_text=detected.is_text,
    )
    return record
21
+
22
+
23
def format_json(results: List[Dict[str, Any]]) -> str:
    """Serialise results as indented JSON.

    Exactly one result is emitted as a bare object; any other number of
    results (including none) is emitted as an array.
    """
    payload = results[0] if len(results) == 1 else results
    return json.dumps(payload, indent=2)
28
+
29
+
30
def format_human(results: List[Dict[str, Any]]) -> str:
    """Render one ``path: description (mime) [score: x.xx]`` line per result."""
    return "\n".join(
        f"{entry['path']}: {entry['description']} ({entry['mime_type']})"
        f" [score: {entry['score']:.2f}]"
        for entry in results
    )
38
+
39
+
40
def format_mime(results: List[Dict[str, Any]]) -> str:
    """Return only the MIME type of each result, newline-separated."""
    mime_types = [entry["mime_type"] for entry in results]
    return "\n".join(mime_types)
@@ -0,0 +1,110 @@
1
+ Metadata-Version: 2.4
2
+ Name: detect-file-type-local
3
+ Version: 0.1.0
4
+ Summary: Security-focused local file type detection powered by Google Magika
5
+ License-Expression: MIT
6
+ Requires-Python: >=3.8
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ License-File: NOTICE
10
+ Requires-Dist: magika<2.0.0,>=1.0.0
11
+ Provides-Extra: dev
12
+ Requires-Dist: pytest>=7.0; extra == "dev"
13
+ Requires-Dist: ruff>=0.4; extra == "dev"
14
+ Dynamic: license-file
15
+
16
+ # detect-file-type-local
17
+
18
+ [![CI](https://github.com/pgeraghty/openclaw-detect-file-type-local/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/pgeraghty/openclaw-detect-file-type-local/actions/workflows/ci.yml)
19
+ [![License: MIT](https://img.shields.io/github/license/pgeraghty/openclaw-detect-file-type-local)](LICENSE)
20
+ ![Python 3.8+](https://img.shields.io/badge/python-3.8%2B-blue)
21
+ ![Inference: Local/Offline](https://img.shields.io/badge/inference-local%20%7C%20offline-success)
22
+ ![API Keys](https://img.shields.io/badge/api_keys-none-success)
23
+
24
+ An [OpenClaw](https://openclaw.org) skill for AI-powered local file type detection.
25
+
26
+ Wraps [Google Magika](https://github.com/google/magika) to provide ML-based file type identification that runs entirely offline. No API keys, no network calls — just local inference on an embedded ONNX model.
27
+
28
+ ## Features
29
+
30
+ - **214 file types** detected by content, not extension
31
+ - **Fully offline** — no network access required
32
+ - **Fast** — only reads the bytes needed for classification
33
+ - **Batch support** — process multiple files or entire directories
34
+ - **Multiple output formats** — JSON, human-readable, bare MIME type
35
+ - **Stdin support** — pipe content directly
36
+
37
+ ## Quick Start
38
+
39
+ ```bash
40
+ pip install -e .
41
+
42
+ # Detect a single file
43
+ detect-file-type-local document.pdf
44
+
45
+ # Batch detect
46
+ detect-file-type-local --human *.pdf *.png
47
+
48
+ # Recursive directory scan
49
+ detect-file-type-local -r ./uploads/
50
+
51
+ # Pipe from stdin
52
+ cat mystery_file | detect-file-type-local -
53
+ ```
54
+
55
+ Compatibility alias: `detect-file-type` remains available.
56
+
57
+ ## Output Formats
58
+
59
+ **JSON (default):**
60
+ ```json
61
+ {
62
+ "path": "photo.jpg",
63
+ "label": "jpeg",
64
+ "mime_type": "image/jpeg",
65
+ "score": 0.99,
66
+ "group": "image",
67
+ "description": "JPEG image",
68
+ "is_text": false
69
+ }
70
+ ```
71
+
72
+ **Human-readable:**
73
+ ```
74
+ photo.jpg: JPEG image (image/jpeg) [score: 0.99]
75
+ ```
76
+
77
+ **MIME-only:**
78
+ ```
79
+ image/jpeg
80
+ ```
81
+
82
+ ## OpenClaw Skill
83
+
84
+ See [SKILL.md](SKILL.md) for the OpenClaw skill definition, including structured output schemas and usage guidance for LLM integration.
85
+
86
+ Note: this skill currently uses manual local installation (`pip install -e ...`). Auto-install metadata will be added after a public package artifact is published and resolvable.
87
+
88
+ ## Development
89
+
90
+ ```bash
91
+ pip install -e '.[dev]'
92
+ pytest tests/ -v
93
+ ruff check .
94
+ ```
95
+
96
+ ## Release
97
+
98
+ PyPI publishing is automated via GitHub Actions (`Publish Python Package` workflow):
99
+
100
+ 1. Create a GitHub release with a tag matching package version (for example, `v0.1.0`)
101
+ 2. Workflow builds and validates artifacts
102
+ 3. Workflow publishes to PyPI via trusted publishing
103
+
104
+ After PyPI release, update and republish the ClawHub skill metadata to enable auto-install from `detect-file-type-local`.
105
+
106
+ ## License
107
+
108
+ MIT — see [LICENSE](LICENSE).
109
+
110
+ This project uses [Google Magika](https://github.com/google/magika) (Apache-2.0). See [NOTICE](NOTICE) and [THIRD_PARTY_LICENSES.md](THIRD_PARTY_LICENSES.md).
@@ -0,0 +1,16 @@
1
+ LICENSE
2
+ NOTICE
3
+ README.md
4
+ pyproject.toml
5
+ detect_file_type/__init__.py
6
+ detect_file_type/__main__.py
7
+ detect_file_type/cli.py
8
+ detect_file_type/formatter.py
9
+ detect_file_type_local.egg-info/PKG-INFO
10
+ detect_file_type_local.egg-info/SOURCES.txt
11
+ detect_file_type_local.egg-info/dependency_links.txt
12
+ detect_file_type_local.egg-info/entry_points.txt
13
+ detect_file_type_local.egg-info/requires.txt
14
+ detect_file_type_local.egg-info/top_level.txt
15
+ tests/test_cli.py
16
+ tests/test_detection.py
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ detect-file-type = detect_file_type.cli:main
3
+ detect-file-type-local = detect_file_type.cli:main
@@ -0,0 +1,5 @@
1
+ magika<2.0.0,>=1.0.0
2
+
3
+ [dev]
4
+ pytest>=7.0
5
+ ruff>=0.4
@@ -0,0 +1,34 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "detect-file-type-local"
7
+ version = "0.1.0"
8
+ description = "Security-focused local file type detection powered by Google Magika"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.8"
12
+ dependencies = [
13
+ "magika>=1.0.0,<2.0.0",
14
+ ]
15
+
16
+ [project.optional-dependencies]
17
+ dev = [
18
+ "pytest>=7.0",
19
+ "ruff>=0.4",
20
+ ]
21
+
22
+ [project.scripts]
23
+ detect-file-type-local = "detect_file_type.cli:main"
24
+ detect-file-type = "detect_file_type.cli:main"
25
+
26
+ [tool.ruff]
27
+ target-version = "py38"
28
+ line-length = 100
29
+
30
+ [tool.ruff.lint]
31
+ select = ["E", "F", "W", "I"]
32
+
33
+ [tool.pytest.ini_options]
34
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,160 @@
1
+ """CLI integration tests."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import subprocess
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ FIXTURES_DIR = Path(__file__).parent / "fixtures"
11
+ CLI_MODULE = [sys.executable, "-m", "detect_file_type"]
12
+
13
+
14
def run_cli(*args: str, stdin_data: bytes | None = None) -> subprocess.CompletedProcess:
    """Run the CLI module in a subprocess, optionally feeding bytes to stdin.

    Output is captured. Streams are handled as text when no stdin data is
    given and as bytes when ``stdin_data`` is provided.
    """
    command = CLI_MODULE + list(args)
    text_mode = stdin_data is None
    return subprocess.run(
        command,
        capture_output=True,
        text=text_mode,
        input=stdin_data,
        timeout=60,
    )
22
+
23
+
24
def run_cli_text(*args: str) -> subprocess.CompletedProcess:
    """Run the CLI module in a subprocess and capture its output as text."""
    command = CLI_MODULE + list(args)
    return subprocess.run(command, capture_output=True, text=True, timeout=60)
31
+
32
+
33
class TestJsonOutput:
    """JSON (default) output: object for one input, array for several."""

    def test_single_file_json(self):
        result = run_cli_text(str(FIXTURES_DIR / "sample.png"))
        assert result.returncode == 0
        data = json.loads(result.stdout)
        assert data["label"] == "png"
        assert data["mime_type"] == "image/png"
        assert isinstance(data["score"], float)
        assert data["group"] == "image"
        assert isinstance(data["is_text"], bool)

    def test_multiple_files_json(self):
        result = run_cli_text(
            str(FIXTURES_DIR / "tiny.txt"),
            str(FIXTURES_DIR / "sample.png"),
        )
        assert result.returncode == 0
        data = json.loads(result.stdout)
        assert isinstance(data, list)
        assert len(data) == 2

    def test_json_output_parseable(self):
        result = run_cli_text(str(FIXTURES_DIR / "sample.zip"))
        # Check the exit status first so a CLI failure is reported as such
        # rather than as a JSON decoding error on empty output.
        assert result.returncode == 0
        data = json.loads(result.stdout)
        required_keys = {"path", "label", "mime_type", "score", "group", "description", "is_text"}
        assert required_keys.issubset(data.keys())

    def test_duplicate_paths_preserve_order(self):
        tiny = str(FIXTURES_DIR / "tiny.txt")
        png = str(FIXTURES_DIR / "sample.png")
        result = run_cli_text(tiny, png, tiny)
        assert result.returncode == 0
        data = json.loads(result.stdout)
        assert [item["path"] for item in data] == [tiny, png, tiny]
67
+
68
+
69
class TestHumanOutput:
    """Checks for the ``--human`` output format."""

    def test_human_format(self):
        png = str(FIXTURES_DIR / "sample.png")
        completed = run_cli_text("--human", png)
        assert completed.returncode == 0
        for fragment in ("image/png", "score:"):
            assert fragment in completed.stdout

    def test_human_batch(self):
        targets = [str(FIXTURES_DIR / name) for name in ("tiny.txt", "sample.png")]
        completed = run_cli_text("--human", *targets)
        assert completed.returncode == 0
        # One output line is expected per input file.
        assert len(completed.stdout.strip().split("\n")) == 2
85
+
86
+
87
class TestMimeOutput:
    """Checks for the ``--mime`` output format."""

    def test_mime_format(self):
        result = run_cli_text("--mime", str(FIXTURES_DIR / "sample.png"))
        assert result.returncode == 0
        assert result.stdout.strip() == "image/png"

    def test_mime_batch(self):
        result = run_cli_text(
            "--mime",
            str(FIXTURES_DIR / "tiny.txt"),
            str(FIXTURES_DIR / "sample.png"),
        )
        # Both inputs exist, so the run must succeed before the output is counted.
        assert result.returncode == 0
        lines = result.stdout.strip().split("\n")
        assert len(lines) == 2
101
+
102
+
103
class TestErrorHandling:
    """Exit codes and diagnostics for missing, empty and partially failing inputs."""

    def test_nonexistent_file(self):
        missing = str(FIXTURES_DIR / "does_not_exist.xyz")
        completed = run_cli_text(missing)
        assert completed.returncode != 0
        assert "No such file" in completed.stderr

    def test_empty_file(self):
        completed = run_cli_text(str(FIXTURES_DIR / "empty.bin"))
        assert completed.returncode == 0
        assert json.loads(completed.stdout)["label"] == "empty"

    def test_partial_failure_exit_code(self):
        present = str(FIXTURES_DIR / "sample.png")
        missing = str(FIXTURES_DIR / "does_not_exist.xyz")
        completed = run_cli_text(present, missing)
        # Should exit 2 (partial failure) — some files succeeded, some failed
        assert completed.returncode == 2

    def test_partial_failure_preserves_success_order_with_duplicates(self):
        tiny, png, missing = (
            str(FIXTURES_DIR / name)
            for name in ("tiny.txt", "sample.png", "does_not_exist.xyz")
        )

        completed = run_cli_text(tiny, missing, png, tiny)
        assert completed.returncode == 2
        reported = [item["path"] for item in json.loads(completed.stdout)]
        assert reported == [tiny, png, tiny]
132
+
133
+
134
class TestStdin:
    """Checks for content piped to the CLI through the ``-`` argument."""

    def test_stdin_detection(self):
        payload = b"Hello, this is plain text content for stdin detection.\n" * 20
        completed = run_cli("-", stdin_data=payload)
        assert completed.returncode == 0
        data = json.loads(completed.stdout)
        assert data["path"] == "-"
        assert data["group"] == "text"

    def test_multiple_stdin_inputs_are_rejected(self):
        completed = run_cli("-", "-", stdin_data=b"hello from stdin\n" * 20)
        assert completed.returncode == 1
        # Output is captured as bytes when stdin data is supplied.
        message = completed.stderr.decode("utf-8", errors="replace")
        assert "multiple stdin inputs are not supported" in message
152
+
153
+
154
class TestRecursive:
    """Checks for ``--recursive`` directory expansion."""

    def test_recursive_directory(self):
        completed = run_cli_text("--recursive", "--json", str(FIXTURES_DIR))
        assert completed.returncode == 0
        entries = json.loads(completed.stdout)
        assert isinstance(entries, list)
        assert len(entries) >= 4  # At least our fixture files
@@ -0,0 +1,60 @@
1
+ """Detection accuracy tests with real fixtures."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ import pytest
8
+ from magika import Magika
9
+
10
+ FIXTURES_DIR = Path(__file__).parent / "fixtures"
11
+
12
+
13
@pytest.fixture(scope="module")
def magika_instance():
    """Provide one ``Magika`` detector shared by every test in this module."""
    detector = Magika()
    return detector
16
+
17
+
18
def test_detect_text(magika_instance):
    """The ``tiny.txt`` fixture is classified into the "text" group."""
    detection = magika_instance.identify_path(FIXTURES_DIR / "tiny.txt")
    assert detection.output.group == "text"
21
+
22
+
23
def test_detect_png(magika_instance):
    """The ``sample.png`` fixture is labelled "png" with MIME type image/png."""
    detected = magika_instance.identify_path(FIXTURES_DIR / "sample.png").output
    assert detected.label == "png"
    assert detected.mime_type == "image/png"
27
+
28
+
29
def test_detect_zip(magika_instance):
    """The ``sample.zip`` fixture is labelled "zip"."""
    detection = magika_instance.identify_path(FIXTURES_DIR / "sample.zip")
    assert detection.output.label == "zip"
32
+
33
+
34
def test_detect_empty(magika_instance):
    """The ``empty.bin`` fixture is labelled "empty"."""
    detection = magika_instance.identify_path(FIXTURES_DIR / "empty.bin")
    assert detection.output.label == "empty"
37
+
38
+
39
def test_detect_misleading_extension(magika_instance):
    """Text content behind a ``.png`` extension must be detected as text, not PNG."""
    detected = magika_instance.identify_path(FIXTURES_DIR / "misleading.txt.png").output
    assert detected.group == "text"
    assert detected.label != "png"
44
+
45
+
46
def test_identify_bytes(magika_instance):
    """An in-memory plain-text buffer is classified into the "text" group."""
    payload = b"Hello, world! This is a plain text test string.\n" * 20
    detection = magika_instance.identify_bytes(payload)
    assert detection.output.group == "text"
50
+
51
+
52
def test_batch_detection(magika_instance):
    """Batch detection returns one result per path, in input order."""
    fixture_names = ("tiny.txt", "sample.png", "sample.zip")
    detections = magika_instance.identify_paths([FIXTURES_DIR / name for name in fixture_names])
    assert len(detections) == 3
    # The second input is the PNG fixture.
    assert detections[1].output.label == "png"