detect-file-type-local 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- detect_file_type_local-0.1.0/LICENSE +21 -0
- detect_file_type_local-0.1.0/NOTICE +12 -0
- detect_file_type_local-0.1.0/PKG-INFO +110 -0
- detect_file_type_local-0.1.0/README.md +95 -0
- detect_file_type_local-0.1.0/detect_file_type/__init__.py +3 -0
- detect_file_type_local-0.1.0/detect_file_type/__main__.py +5 -0
- detect_file_type_local-0.1.0/detect_file_type/cli.py +147 -0
- detect_file_type_local-0.1.0/detect_file_type/formatter.py +42 -0
- detect_file_type_local-0.1.0/detect_file_type_local.egg-info/PKG-INFO +110 -0
- detect_file_type_local-0.1.0/detect_file_type_local.egg-info/SOURCES.txt +16 -0
- detect_file_type_local-0.1.0/detect_file_type_local.egg-info/dependency_links.txt +1 -0
- detect_file_type_local-0.1.0/detect_file_type_local.egg-info/entry_points.txt +3 -0
- detect_file_type_local-0.1.0/detect_file_type_local.egg-info/requires.txt +5 -0
- detect_file_type_local-0.1.0/detect_file_type_local.egg-info/top_level.txt +1 -0
- detect_file_type_local-0.1.0/pyproject.toml +34 -0
- detect_file_type_local-0.1.0/setup.cfg +4 -0
- detect_file_type_local-0.1.0/tests/test_cli.py +160 -0
- detect_file_type_local-0.1.0/tests/test_detection.py +60 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 detect-file-type-skill contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
detect-file-type-skill
|
|
2
|
+
Copyright (c) 2026 detect-file-type-skill contributors
|
|
3
|
+
|
|
4
|
+
This product includes software developed by Google LLC:
|
|
5
|
+
|
|
6
|
+
Magika - AI-powered file type detection
|
|
7
|
+
Copyright 2024 Google LLC
|
|
8
|
+
Licensed under the Apache License, Version 2.0
|
|
9
|
+
https://github.com/google/magika
|
|
10
|
+
|
|
11
|
+
You may obtain a copy of the Apache License, Version 2.0 at:
|
|
12
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: detect-file-type-local
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Security-focused local file type detection powered by Google Magika
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Requires-Python: >=3.8
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
License-File: NOTICE
|
|
10
|
+
Requires-Dist: magika<2.0.0,>=1.0.0
|
|
11
|
+
Provides-Extra: dev
|
|
12
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
13
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
14
|
+
Dynamic: license-file
|
|
15
|
+
|
|
16
|
+
# detect-file-type-local
|
|
17
|
+
|
|
18
|
+
[CI status](https://github.com/pgeraghty/openclaw-detect-file-type-local/actions/workflows/ci.yml)
|
|
19
|
+
[License: MIT](LICENSE)
|
|
20
|
+

|
|
21
|
+

|
|
22
|
+

|
|
23
|
+
|
|
24
|
+
An [OpenClaw](https://openclaw.org) skill for AI-powered local file type detection.
|
|
25
|
+
|
|
26
|
+
Wraps [Google Magika](https://github.com/google/magika) to provide ML-based file type identification that runs entirely offline. No API keys, no network calls — just local inference on an embedded ONNX model.
|
|
27
|
+
|
|
28
|
+
## Features
|
|
29
|
+
|
|
30
|
+
- **214 file types** detected by content, not extension
|
|
31
|
+
- **Fully offline** — no network access required
|
|
32
|
+
- **Fast** — only reads the bytes needed for classification
|
|
33
|
+
- **Batch support** — process multiple files or entire directories
|
|
34
|
+
- **Multiple output formats** — JSON, human-readable, bare MIME type
|
|
35
|
+
- **Stdin support** — pipe content directly
|
|
36
|
+
|
|
37
|
+
## Quick Start
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install -e .
|
|
41
|
+
|
|
42
|
+
# Detect a single file
|
|
43
|
+
detect-file-type-local document.pdf
|
|
44
|
+
|
|
45
|
+
# Batch detect
|
|
46
|
+
detect-file-type-local --human *.pdf *.png
|
|
47
|
+
|
|
48
|
+
# Recursive directory scan
|
|
49
|
+
detect-file-type-local -r ./uploads/
|
|
50
|
+
|
|
51
|
+
# Pipe from stdin
|
|
52
|
+
cat mystery_file | detect-file-type-local -
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Compatibility alias: `detect-file-type` remains available.
|
|
56
|
+
|
|
57
|
+
## Output Formats
|
|
58
|
+
|
|
59
|
+
**JSON (default):**
|
|
60
|
+
```json
|
|
61
|
+
{
|
|
62
|
+
"path": "photo.jpg",
|
|
63
|
+
"label": "jpeg",
|
|
64
|
+
"mime_type": "image/jpeg",
|
|
65
|
+
"score": 0.99,
|
|
66
|
+
"group": "image",
|
|
67
|
+
"description": "JPEG image",
|
|
68
|
+
"is_text": false
|
|
69
|
+
}
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
**Human-readable:**
|
|
73
|
+
```
|
|
74
|
+
photo.jpg: JPEG image (image/jpeg) [score: 0.99]
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
**MIME-only:**
|
|
78
|
+
```
|
|
79
|
+
image/jpeg
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## OpenClaw Skill
|
|
83
|
+
|
|
84
|
+
See [SKILL.md](SKILL.md) for the OpenClaw skill definition, including structured output schemas and usage guidance for LLM integration.
|
|
85
|
+
|
|
86
|
+
Note: this skill currently uses manual local installation (`pip install -e ...`). Auto-install metadata will be added after a public package artifact is published and resolvable.
|
|
87
|
+
|
|
88
|
+
## Development
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
pip install -e '.[dev]'
|
|
92
|
+
pytest tests/ -v
|
|
93
|
+
ruff check .
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Release
|
|
97
|
+
|
|
98
|
+
PyPI publishing is automated via GitHub Actions (`Publish Python Package` workflow):
|
|
99
|
+
|
|
100
|
+
1. Create a GitHub release with a tag matching the package version (for example, `v0.1.0`)
|
|
101
|
+
2. Workflow builds and validates artifacts
|
|
102
|
+
3. Workflow publishes to PyPI via trusted publishing
|
|
103
|
+
|
|
104
|
+
After PyPI release, update and republish the ClawHub skill metadata to enable auto-install from `detect-file-type-local`.
|
|
105
|
+
|
|
106
|
+
## License
|
|
107
|
+
|
|
108
|
+
MIT — see [LICENSE](LICENSE).
|
|
109
|
+
|
|
110
|
+
This project uses [Google Magika](https://github.com/google/magika) (Apache-2.0). See [NOTICE](NOTICE) and [THIRD_PARTY_LICENSES.md](THIRD_PARTY_LICENSES.md).
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
# detect-file-type-local
|
|
2
|
+
|
|
3
|
+
[CI status](https://github.com/pgeraghty/openclaw-detect-file-type-local/actions/workflows/ci.yml)
|
|
4
|
+
[License: MIT](LICENSE)
|
|
5
|
+

|
|
6
|
+

|
|
7
|
+

|
|
8
|
+
|
|
9
|
+
An [OpenClaw](https://openclaw.org) skill for AI-powered local file type detection.
|
|
10
|
+
|
|
11
|
+
Wraps [Google Magika](https://github.com/google/magika) to provide ML-based file type identification that runs entirely offline. No API keys, no network calls — just local inference on an embedded ONNX model.
|
|
12
|
+
|
|
13
|
+
## Features
|
|
14
|
+
|
|
15
|
+
- **214 file types** detected by content, not extension
|
|
16
|
+
- **Fully offline** — no network access required
|
|
17
|
+
- **Fast** — only reads the bytes needed for classification
|
|
18
|
+
- **Batch support** — process multiple files or entire directories
|
|
19
|
+
- **Multiple output formats** — JSON, human-readable, bare MIME type
|
|
20
|
+
- **Stdin support** — pipe content directly
|
|
21
|
+
|
|
22
|
+
## Quick Start
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install -e .
|
|
26
|
+
|
|
27
|
+
# Detect a single file
|
|
28
|
+
detect-file-type-local document.pdf
|
|
29
|
+
|
|
30
|
+
# Batch detect
|
|
31
|
+
detect-file-type-local --human *.pdf *.png
|
|
32
|
+
|
|
33
|
+
# Recursive directory scan
|
|
34
|
+
detect-file-type-local -r ./uploads/
|
|
35
|
+
|
|
36
|
+
# Pipe from stdin
|
|
37
|
+
cat mystery_file | detect-file-type-local -
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Compatibility alias: `detect-file-type` remains available.
|
|
41
|
+
|
|
42
|
+
## Output Formats
|
|
43
|
+
|
|
44
|
+
**JSON (default):**
|
|
45
|
+
```json
|
|
46
|
+
{
|
|
47
|
+
"path": "photo.jpg",
|
|
48
|
+
"label": "jpeg",
|
|
49
|
+
"mime_type": "image/jpeg",
|
|
50
|
+
"score": 0.99,
|
|
51
|
+
"group": "image",
|
|
52
|
+
"description": "JPEG image",
|
|
53
|
+
"is_text": false
|
|
54
|
+
}
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
**Human-readable:**
|
|
58
|
+
```
|
|
59
|
+
photo.jpg: JPEG image (image/jpeg) [score: 0.99]
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
**MIME-only:**
|
|
63
|
+
```
|
|
64
|
+
image/jpeg
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## OpenClaw Skill
|
|
68
|
+
|
|
69
|
+
See [SKILL.md](SKILL.md) for the OpenClaw skill definition, including structured output schemas and usage guidance for LLM integration.
|
|
70
|
+
|
|
71
|
+
Note: this skill currently uses manual local installation (`pip install -e ...`). Auto-install metadata will be added after a public package artifact is published and resolvable.
|
|
72
|
+
|
|
73
|
+
## Development
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pip install -e '.[dev]'
|
|
77
|
+
pytest tests/ -v
|
|
78
|
+
ruff check .
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Release
|
|
82
|
+
|
|
83
|
+
PyPI publishing is automated via GitHub Actions (`Publish Python Package` workflow):
|
|
84
|
+
|
|
85
|
+
1. Create a GitHub release with a tag matching the package version (for example, `v0.1.0`)
|
|
86
|
+
2. Workflow builds and validates artifacts
|
|
87
|
+
3. Workflow publishes to PyPI via trusted publishing
|
|
88
|
+
|
|
89
|
+
After PyPI release, update and republish the ClawHub skill metadata to enable auto-install from `detect-file-type-local`.
|
|
90
|
+
|
|
91
|
+
## License
|
|
92
|
+
|
|
93
|
+
MIT — see [LICENSE](LICENSE).
|
|
94
|
+
|
|
95
|
+
This project uses [Google Magika](https://github.com/google/magika) (Apache-2.0). See [NOTICE](NOTICE) and [THIRD_PARTY_LICENSES.md](THIRD_PARTY_LICENSES.md).
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""CLI entry point for detect-file-type."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import os
|
|
7
|
+
import sys
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import List
|
|
10
|
+
|
|
11
|
+
from magika import Magika
|
|
12
|
+
|
|
13
|
+
from detect_file_type.formatter import (
|
|
14
|
+
format_human,
|
|
15
|
+
format_json,
|
|
16
|
+
format_mime,
|
|
17
|
+
result_to_dict,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
STDIN_MAX_BYTES = 1_048_576  # 1 MiB: maximum number of bytes read from stdin for detection
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def collect_paths(args_paths: List[str], recursive: bool) -> List[str]:
    """Expand the command-line path arguments into the list of inputs to scan.

    Args:
        args_paths: Paths exactly as given on the command line. The special
            value ``"-"`` (stdin) is always passed through untouched.
        recursive: When true, every argument that is a directory is replaced
            by the files found beneath it.

    Returns:
        The expanded list, in argument order. Files discovered under a
        directory are listed in a deterministic order: each directory's own
        files (sorted by name) first, then its subdirectories in sorted order.
        Arguments that are not directories, or any argument when *recursive*
        is false, are returned as-is.
    """
    expanded = []
    for p in args_paths:
        if p == "-":
            expanded.append("-")
            continue
        path = Path(p)
        if recursive and path.is_dir():
            for root, dirs, files in os.walk(path):
                # Sorting in place steers os.walk's descent, so the traversal
                # order no longer depends on the filesystem's listing order.
                dirs.sort()
                expanded.extend(os.path.join(root, f) for f in sorted(files))
        else:
            expanded.append(p)
    return expanded
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _path_problem(p: str) -> str | None:
    """Return the reason *p* cannot be scanned, or None when it is usable.

    Any OSError raised while probing the path (not only a permission failure
    on open) is turned into a message, so one bad path cannot abort the batch.
    """
    candidate = Path(p)
    try:
        if not candidate.exists():
            return "No such file or directory"
        if not candidate.is_file():
            return "Not a regular file"
        # Opening the file is the readability probe; nothing is read here.
        with open(candidate, "rb"):
            pass
    except PermissionError:
        return "Permission denied"
    except OSError as e:
        return e.strerror or str(e)
    return None


def detect_files(magika_instance: Magika, paths: List[str]) -> tuple:
    """Detect the type of every input and report whether anything failed.

    Args:
        magika_instance: Detector providing ``identify_bytes`` and
            ``identify_paths``.
        paths: Inputs to scan; ``"-"`` means "read from stdin" and may appear
            at most once.

    Returns:
        ``(results_list, had_errors)``. ``results_list`` holds one dict per
        successfully scanned input, in the order the inputs were given.
        ``had_errors`` is true when any input was rejected or detection
        failed; each problem is reported on stderr.
    """
    results = []
    had_errors = False

    # Keep each input's original index so results can be re-ordered at the end.
    file_entries = [(i, p) for i, p in enumerate(paths) if p != "-"]
    stdin_indices = [i for i, p in enumerate(paths) if p == "-"]

    # stdin is a single stream, so it can only be consumed once.
    if len(stdin_indices) > 1:
        print("error: multiple stdin inputs are not supported; use '-' only once", file=sys.stderr)
        had_errors = True
    elif stdin_indices:
        idx = stdin_indices[0]
        try:
            data = sys.stdin.buffer.read(STDIN_MAX_BYTES)
            result = magika_instance.identify_bytes(data)
            results.append((idx, result_to_dict("-", result)))
        except Exception as e:
            print(f"error: stdin: {e}", file=sys.stderr)
            had_errors = True

    # Filter out unusable paths first so the detector is called once, in batch.
    valid_file_entries = []
    for idx, p in file_entries:
        problem = _path_problem(p)
        if problem is not None:
            print(f"error: {p}: {problem}", file=sys.stderr)
            had_errors = True
            continue
        valid_file_entries.append((idx, p))

    if valid_file_entries:
        path_objects = [Path(p) for _idx, p in valid_file_entries]
        try:
            magika_results = magika_instance.identify_paths(path_objects)
            for (idx, p_str), result in zip(valid_file_entries, magika_results):
                results.append((idx, result_to_dict(p_str, result)))
        except Exception as e:
            print(f"error: detection failed: {e}", file=sys.stderr)
            had_errors = True

    # Restore the order in which the inputs were given.
    results.sort(key=lambda x: x[0])
    return [r[1] for r in results], had_errors
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def main(argv: List[str] | None = None) -> None:
    """Run the command-line interface.

    Parses *argv* (argparse falls back to the process arguments when it is
    None), expands the requested paths, runs detection and prints the results
    in the selected format.

    Exits with status 1 when there is nothing to process or nothing could be
    detected, and with status 2 when output was produced but some inputs
    failed.
    """
    parser = argparse.ArgumentParser(
        prog="detect-file-type",
        description="AI-powered local file type detection using Google Magika",
    )
    parser.add_argument("paths", nargs="+", help="File paths to detect (use - for stdin)")
    # The three format switches share one destination; the last one given wins.
    format_flags = (
        ("--json", "json", "JSON output (default)"),
        ("--human", "human", "Human-readable output"),
        ("--mime", "mime", "Bare MIME type output"),
    )
    for flag, value, help_text in format_flags:
        parser.add_argument(flag, dest="format", action="store_const", const=value, help=help_text)
    parser.add_argument(
        "--recursive", "-r", action="store_true", help="Recurse into directories"
    )
    parser.set_defaults(format="json")

    options = parser.parse_args(argv)

    targets = collect_paths(options.paths, options.recursive)
    if not targets:
        print("error: no files to process", file=sys.stderr)
        sys.exit(1)

    detector = Magika()
    results, had_errors = detect_files(detector, targets)

    if not results:
        sys.exit(1)

    renderers = {"json": format_json, "human": format_human, "mime": format_mime}
    render = renderers.get(options.format)
    if render is not None:
        print(render(results))

    if had_errors:
        sys.exit(2)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
# Run the CLI when this module is executed directly as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Output formatting for file type detection results."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Any, Dict, List
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def result_to_dict(path: str, result: Any) -> Dict[str, Any]:
    """Flatten one detection result into a plain dictionary.

    Args:
        path: Label to store under ``"path"`` for this input.
        result: Object exposing ``score`` and an ``output`` with ``label``,
            ``mime_type``, ``group``, ``description`` and ``is_text``.

    Returns:
        A dict with the keys ``path``, ``label``, ``mime_type``, ``score``
        (rounded to four decimal places), ``group``, ``description`` and
        ``is_text``, in that order.
    """
    info = result.output
    return dict(
        path=path,
        label=info.label,
        mime_type=info.mime_type,
        score=round(result.score, 4),
        group=info.group,
        description=info.description,
        is_text=info.is_text,
    )
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def format_json(results: List[Dict[str, Any]]) -> str:
    """Serialise results as indented JSON.

    Exactly one result is emitted as a bare object; any other number of
    results (including none) is emitted as an array.
    """
    payload = results[0] if len(results) == 1 else results
    return json.dumps(payload, indent=2)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def format_human(results: List[Dict[str, Any]]) -> str:
    """Render one ``path: description (mime) [score: x.xx]`` line per result.

    Lines are joined with newlines; the score is shown with two decimals.
    """
    return "\n".join(
        f"{entry['path']}: {entry['description']} ({entry['mime_type']}) "
        f"[score: {entry['score']:.2f}]"
        for entry in results
    )
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def format_mime(results: List[Dict[str, Any]]) -> str:
    """Return only the MIME type of each result, newline-separated."""
    mime_types = []
    for entry in results:
        mime_types.append(entry["mime_type"])
    return "\n".join(mime_types)
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: detect-file-type-local
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Security-focused local file type detection powered by Google Magika
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Requires-Python: >=3.8
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
License-File: NOTICE
|
|
10
|
+
Requires-Dist: magika<2.0.0,>=1.0.0
|
|
11
|
+
Provides-Extra: dev
|
|
12
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
13
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
14
|
+
Dynamic: license-file
|
|
15
|
+
|
|
16
|
+
# detect-file-type-local
|
|
17
|
+
|
|
18
|
+
[CI status](https://github.com/pgeraghty/openclaw-detect-file-type-local/actions/workflows/ci.yml)
|
|
19
|
+
[License: MIT](LICENSE)
|
|
20
|
+

|
|
21
|
+

|
|
22
|
+

|
|
23
|
+
|
|
24
|
+
An [OpenClaw](https://openclaw.org) skill for AI-powered local file type detection.
|
|
25
|
+
|
|
26
|
+
Wraps [Google Magika](https://github.com/google/magika) to provide ML-based file type identification that runs entirely offline. No API keys, no network calls — just local inference on an embedded ONNX model.
|
|
27
|
+
|
|
28
|
+
## Features
|
|
29
|
+
|
|
30
|
+
- **214 file types** detected by content, not extension
|
|
31
|
+
- **Fully offline** — no network access required
|
|
32
|
+
- **Fast** — only reads the bytes needed for classification
|
|
33
|
+
- **Batch support** — process multiple files or entire directories
|
|
34
|
+
- **Multiple output formats** — JSON, human-readable, bare MIME type
|
|
35
|
+
- **Stdin support** — pipe content directly
|
|
36
|
+
|
|
37
|
+
## Quick Start
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install -e .
|
|
41
|
+
|
|
42
|
+
# Detect a single file
|
|
43
|
+
detect-file-type-local document.pdf
|
|
44
|
+
|
|
45
|
+
# Batch detect
|
|
46
|
+
detect-file-type-local --human *.pdf *.png
|
|
47
|
+
|
|
48
|
+
# Recursive directory scan
|
|
49
|
+
detect-file-type-local -r ./uploads/
|
|
50
|
+
|
|
51
|
+
# Pipe from stdin
|
|
52
|
+
cat mystery_file | detect-file-type-local -
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Compatibility alias: `detect-file-type` remains available.
|
|
56
|
+
|
|
57
|
+
## Output Formats
|
|
58
|
+
|
|
59
|
+
**JSON (default):**
|
|
60
|
+
```json
|
|
61
|
+
{
|
|
62
|
+
"path": "photo.jpg",
|
|
63
|
+
"label": "jpeg",
|
|
64
|
+
"mime_type": "image/jpeg",
|
|
65
|
+
"score": 0.99,
|
|
66
|
+
"group": "image",
|
|
67
|
+
"description": "JPEG image",
|
|
68
|
+
"is_text": false
|
|
69
|
+
}
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
**Human-readable:**
|
|
73
|
+
```
|
|
74
|
+
photo.jpg: JPEG image (image/jpeg) [score: 0.99]
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
**MIME-only:**
|
|
78
|
+
```
|
|
79
|
+
image/jpeg
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## OpenClaw Skill
|
|
83
|
+
|
|
84
|
+
See [SKILL.md](SKILL.md) for the OpenClaw skill definition, including structured output schemas and usage guidance for LLM integration.
|
|
85
|
+
|
|
86
|
+
Note: this skill currently uses manual local installation (`pip install -e ...`). Auto-install metadata will be added after a public package artifact is published and resolvable.
|
|
87
|
+
|
|
88
|
+
## Development
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
pip install -e '.[dev]'
|
|
92
|
+
pytest tests/ -v
|
|
93
|
+
ruff check .
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Release
|
|
97
|
+
|
|
98
|
+
PyPI publishing is automated via GitHub Actions (`Publish Python Package` workflow):
|
|
99
|
+
|
|
100
|
+
1. Create a GitHub release with a tag matching the package version (for example, `v0.1.0`)
|
|
101
|
+
2. Workflow builds and validates artifacts
|
|
102
|
+
3. Workflow publishes to PyPI via trusted publishing
|
|
103
|
+
|
|
104
|
+
After PyPI release, update and republish the ClawHub skill metadata to enable auto-install from `detect-file-type-local`.
|
|
105
|
+
|
|
106
|
+
## License
|
|
107
|
+
|
|
108
|
+
MIT — see [LICENSE](LICENSE).
|
|
109
|
+
|
|
110
|
+
This project uses [Google Magika](https://github.com/google/magika) (Apache-2.0). See [NOTICE](NOTICE) and [THIRD_PARTY_LICENSES.md](THIRD_PARTY_LICENSES.md).
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
NOTICE
|
|
3
|
+
README.md
|
|
4
|
+
pyproject.toml
|
|
5
|
+
detect_file_type/__init__.py
|
|
6
|
+
detect_file_type/__main__.py
|
|
7
|
+
detect_file_type/cli.py
|
|
8
|
+
detect_file_type/formatter.py
|
|
9
|
+
detect_file_type_local.egg-info/PKG-INFO
|
|
10
|
+
detect_file_type_local.egg-info/SOURCES.txt
|
|
11
|
+
detect_file_type_local.egg-info/dependency_links.txt
|
|
12
|
+
detect_file_type_local.egg-info/entry_points.txt
|
|
13
|
+
detect_file_type_local.egg-info/requires.txt
|
|
14
|
+
detect_file_type_local.egg-info/top_level.txt
|
|
15
|
+
tests/test_cli.py
|
|
16
|
+
tests/test_detection.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
detect_file_type
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
[build-system]
# `project.license` is written in this file as a bare SPDX string (license = "MIT").
# That form is only accepted by setuptools 77.0.0 and newer; older releases
# reject it during metadata validation.
requires = ["setuptools>=77.0.0"]
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "detect-file-type-local"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Security-focused local file type detection powered by Google Magika"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.8"
|
|
12
|
+
dependencies = [
|
|
13
|
+
"magika>=1.0.0,<2.0.0",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[project.optional-dependencies]
|
|
17
|
+
dev = [
|
|
18
|
+
"pytest>=7.0",
|
|
19
|
+
"ruff>=0.4",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[project.scripts]
|
|
23
|
+
detect-file-type-local = "detect_file_type.cli:main"
|
|
24
|
+
detect-file-type = "detect_file_type.cli:main"
|
|
25
|
+
|
|
26
|
+
[tool.ruff]
|
|
27
|
+
target-version = "py38"
|
|
28
|
+
line-length = 100
|
|
29
|
+
|
|
30
|
+
[tool.ruff.lint]
|
|
31
|
+
select = ["E", "F", "W", "I"]
|
|
32
|
+
|
|
33
|
+
[tool.pytest.ini_options]
|
|
34
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""CLI integration tests."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import subprocess
|
|
7
|
+
import sys
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
# Directory of sample files, resolved relative to this test module.
FIXTURES_DIR = Path(__file__).parent / "fixtures"
# Command prefix that runs the package with the current interpreter's -m switch.
CLI_MODULE = [sys.executable, "-m", "detect_file_type"]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def run_cli(*args: str, stdin_data: bytes | None = None) -> subprocess.CompletedProcess:
    """Run the CLI with *args*, optionally feeding raw bytes on stdin.

    Output is captured. Streams are handled as text only when no stdin
    payload is given; with a payload they stay as bytes.
    """
    command = [*CLI_MODULE, *args]
    text_mode = stdin_data is None
    return subprocess.run(
        command,
        capture_output=True,
        text=text_mode,
        input=stdin_data,
        timeout=60,
    )
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def run_cli_text(*args: str) -> subprocess.CompletedProcess:
    """Run the CLI with *args* and capture stdout/stderr as text."""
    command = [*CLI_MODULE, *args]
    return subprocess.run(command, capture_output=True, text=True, timeout=60)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class TestJsonOutput:
    """Checks on the default JSON output mode."""

    def test_single_file_json(self):
        """A single input produces one JSON object with the expected fields."""
        proc = run_cli_text(str(FIXTURES_DIR / "sample.png"))
        assert proc.returncode == 0
        payload = json.loads(proc.stdout)
        assert payload["label"] == "png"
        assert payload["mime_type"] == "image/png"
        assert isinstance(payload["score"], float)
        assert payload["group"] == "image"
        assert isinstance(payload["is_text"], bool)

    def test_multiple_files_json(self):
        """Two inputs produce a JSON array with two entries."""
        inputs = [str(FIXTURES_DIR / "tiny.txt"), str(FIXTURES_DIR / "sample.png")]
        proc = run_cli_text(*inputs)
        assert proc.returncode == 0
        payload = json.loads(proc.stdout)
        assert isinstance(payload, list)
        assert len(payload) == 2

    def test_json_output_parseable(self):
        """The JSON object carries every documented key."""
        proc = run_cli_text(str(FIXTURES_DIR / "sample.zip"))
        payload = json.loads(proc.stdout)
        expected_keys = {"path", "label", "mime_type", "score", "group", "description", "is_text"}
        assert expected_keys.issubset(payload.keys())

    def test_duplicate_paths_preserve_order(self):
        """Repeated paths are reported once per occurrence, in argument order."""
        tiny = str(FIXTURES_DIR / "tiny.txt")
        png = str(FIXTURES_DIR / "sample.png")
        proc = run_cli_text(tiny, png, tiny)
        assert proc.returncode == 0
        reported = [entry["path"] for entry in json.loads(proc.stdout)]
        assert reported == [tiny, png, tiny]
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class TestHumanOutput:
    """Checks on the --human output mode."""

    def test_human_format(self):
        """The human line mentions the MIME type and a score."""
        proc = run_cli_text("--human", str(FIXTURES_DIR / "sample.png"))
        assert proc.returncode == 0
        assert "image/png" in proc.stdout
        assert "score:" in proc.stdout

    def test_human_batch(self):
        """Two inputs produce two output lines."""
        inputs = [str(FIXTURES_DIR / "tiny.txt"), str(FIXTURES_DIR / "sample.png")]
        proc = run_cli_text("--human", *inputs)
        assert proc.returncode == 0
        output_lines = proc.stdout.strip().split("\n")
        assert len(output_lines) == 2
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class TestMimeOutput:
    """Checks on the --mime output mode."""

    def test_mime_format(self):
        """A single input prints exactly its MIME type."""
        proc = run_cli_text("--mime", str(FIXTURES_DIR / "sample.png"))
        assert proc.returncode == 0
        assert proc.stdout.strip() == "image/png"

    def test_mime_batch(self):
        """Two inputs produce two output lines."""
        inputs = [str(FIXTURES_DIR / "tiny.txt"), str(FIXTURES_DIR / "sample.png")]
        proc = run_cli_text("--mime", *inputs)
        output_lines = proc.stdout.strip().split("\n")
        assert len(output_lines) == 2
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class TestErrorHandling:
    """Checks on exit codes and messages for problematic inputs."""

    def test_nonexistent_file(self):
        """A missing path fails and is reported on stderr."""
        proc = run_cli_text(str(FIXTURES_DIR / "does_not_exist.xyz"))
        assert proc.returncode != 0
        assert "No such file" in proc.stderr

    def test_empty_file(self):
        """A zero-byte file is detected successfully with the 'empty' label."""
        proc = run_cli_text(str(FIXTURES_DIR / "empty.bin"))
        assert proc.returncode == 0
        payload = json.loads(proc.stdout)
        assert payload["label"] == "empty"

    def test_partial_failure_exit_code(self):
        """One good and one missing input exits with status 2."""
        good = str(FIXTURES_DIR / "sample.png")
        missing = str(FIXTURES_DIR / "does_not_exist.xyz")
        proc = run_cli_text(good, missing)
        # Exit status 2 signals partial failure: some inputs succeeded, some did not.
        assert proc.returncode == 2

    def test_partial_failure_preserves_success_order_with_duplicates(self):
        """Successful inputs keep their relative order when one input fails."""
        tiny = str(FIXTURES_DIR / "tiny.txt")
        png = str(FIXTURES_DIR / "sample.png")
        missing = str(FIXTURES_DIR / "does_not_exist.xyz")

        proc = run_cli_text(tiny, missing, png, tiny)
        assert proc.returncode == 2
        reported = [entry["path"] for entry in json.loads(proc.stdout)]
        assert reported == [tiny, png, tiny]
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class TestStdin:
    """Checks on reading content from stdin via the '-' argument."""

    def test_stdin_detection(self):
        """Text piped on stdin is reported under path '-' in the text group."""
        piped = b"Hello, this is plain text content for stdin detection.\n" * 20
        proc = subprocess.run(
            [*CLI_MODULE, "-"],
            input=piped,
            capture_output=True,
            timeout=60,
        )
        assert proc.returncode == 0
        payload = json.loads(proc.stdout)
        assert payload["path"] == "-"
        assert payload["group"] == "text"

    def test_multiple_stdin_inputs_are_rejected(self):
        """Passing '-' twice fails with an explanatory message on stderr."""
        proc = run_cli("-", "-", stdin_data=b"hello from stdin\n" * 20)
        assert proc.returncode == 1
        message = proc.stderr.decode("utf-8", errors="replace")
        assert "multiple stdin inputs are not supported" in message
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
class TestRecursive:
    """Checks on the --recursive directory expansion."""

    def test_recursive_directory(self):
        """Scanning the fixtures directory yields a list of at least four results."""
        proc = run_cli_text("--recursive", "--json", str(FIXTURES_DIR))
        assert proc.returncode == 0
        payload = json.loads(proc.stdout)
        assert isinstance(payload, list)
        assert len(payload) >= 4  # At least our fixture files
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Detection accuracy tests with real fixtures."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import pytest
|
|
8
|
+
from magika import Magika
|
|
9
|
+
|
|
10
|
+
# Directory of sample files, resolved relative to this test module.
FIXTURES_DIR = Path(__file__).parent / "fixtures"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@pytest.fixture(scope="module")
def magika_instance():
    """Provide a single Magika instance shared by the tests in this module."""
    detector = Magika()
    return detector
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def test_detect_text(magika_instance):
    """The tiny.txt fixture is classified into the text group."""
    detected = magika_instance.identify_path(FIXTURES_DIR / "tiny.txt")
    assert detected.output.group == "text"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_detect_png(magika_instance):
    """The sample.png fixture is labelled png with the image/png MIME type."""
    detected = magika_instance.identify_path(FIXTURES_DIR / "sample.png")
    info = detected.output
    assert info.label == "png"
    assert info.mime_type == "image/png"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def test_detect_zip(magika_instance):
    """The sample.zip fixture is labelled zip."""
    detected = magika_instance.identify_path(FIXTURES_DIR / "sample.zip")
    assert detected.output.label == "zip"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def test_detect_empty(magika_instance):
    """The empty.bin fixture is labelled empty."""
    detected = magika_instance.identify_path(FIXTURES_DIR / "empty.bin")
    assert detected.output.label == "empty"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_detect_misleading_extension(magika_instance):
    """Text content behind a .png extension must be detected as text, not PNG."""
    detected = magika_instance.identify_path(FIXTURES_DIR / "misleading.txt.png")
    info = detected.output
    assert info.group == "text"
    assert info.label != "png"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_identify_bytes(magika_instance):
    """An in-memory plain-text buffer is classified into the text group."""
    sample = b"Hello, world! This is a plain text test string.\n" * 20
    detected = magika_instance.identify_bytes(sample)
    assert detected.output.group == "text"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def test_batch_detection(magika_instance):
    """Batch detection returns one result per path, in input order."""
    fixture_names = ("tiny.txt", "sample.png", "sample.zip")
    batch = [FIXTURES_DIR / name for name in fixture_names]
    detections = magika_instance.identify_paths(batch)
    assert len(detections) == 3
    assert detections[1].output.label == "png"
|