auto-analyser 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Binary file
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Michael Borck
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,13 @@
1
+ Metadata-Version: 2.4
2
+ Name: auto-analyser
3
+ Version: 0.2.0
4
+ Summary: Auto-analyser — detect a file's format and route it to the right analyser family member
5
+ License-File: LICENSE
6
+ Requires-Python: >=3.11
7
+ Requires-Dist: httpx>=0.27.0
8
+ Requires-Dist: pyyaml>=6.0.0
9
+ Requires-Dist: rich>=13.0.0
10
+ Provides-Extra: dev
11
+ Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
12
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
13
+ Requires-Dist: respx>=0.21.0; extra == 'dev'
@@ -0,0 +1,83 @@
1
+ # auto-analyser
2
+
3
+ Routes any file to the right analyser. Detects the file format, calls the appropriate tool, and returns the result — so you don't need to know which analyser handles which format.
4
+
5
+ Part of the [analyser family](#the-analyser-family).
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pip install auto-analyser
11
+ ```
12
+
13
+ Requires Python 3.11+. The analysers it calls must be installed and reachable separately.
14
+
15
+ ## Usage
16
+
17
+ ### CLI
18
+
19
+ ```bash
20
+ # Detect which analyser would handle a file
21
+ auto-analyser detect report.pdf # report.pdf -> document-analyser
21
+ auto-analyser detect interview.mp3 # interview.mp3 -> speech-analyser
22
+ auto-analyser detect data.xlsx # data.xlsx -> records-analyser
23
+
24
+ # Analyse a file — auto-detects format and routes
25
+ auto-analyser analyse report.pdf
26
+ auto-analyser analyse recording.mp3 --json
27
+
28
+ # Force a specific analyser
29
+ auto-analyser analyse interview.mp4 --analyser speech-analyser
30
+
31
+ # Check which analysers are reachable
32
+ auto-analyser status
34
+ ```
35
+
36
+ ### Python
37
+
38
+ ```python
39
+ from auto_analyser import Router
40
+
41
+ router = Router()
42
+ result = router.route("report.pdf")
43
+ print(result["routed_to"]) # "document-analyser"
44
+ ```
45
+
46
+ ## Configuration
47
+
48
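+ auto-analyser ships with built-in defaults (every analyser is reached over HTTP on `localhost`, ports 8000–8005, starting with document-analyser on `localhost:8000`). Override with a YAML config file at `./auto-analyser.yaml` or `~/.config/auto-analyser/config.yaml`; the first file found is used, and its entries are listed under a top-level `analysers:` key: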
49
+
50
+ ```yaml
51
+ analysers:
52
+ document-analyser:
53
+ type: http
54
+ url: http://localhost:8000
55
+ formats: [.pdf, .docx, .pptx, .txt, .md]
56
+
57
+ speech-analyser:
58
+ type: cli
59
+ command: speech-analyser
60
+ formats: [.mp3, .wav, .m4a, .ogg, .flac, .mp4, .mov]
61
+
62
+ records-analyser:
63
+ type: http
64
+ url: http://localhost:8003
65
+ formats: [.csv, .tsv, .xlsx, .parquet, .db, .sqlite]
66
+ ```
67
+
68
+ ## The analyser family
69
+
70
+ Low-level analysis tools. Each accepts files directly and returns structured JSON. Build your own UI or pipeline on top.
71
+
72
+ | Package | Handles |
73
+ |---|---|
74
+ | [speech-analyser](https://github.com/michael-borck/speech-analyser) | audio and video files — transcript and speech metrics |
75
+ | [video-analyser](https://github.com/michael-borck/video-analyser) | video files — frames, scenes, and visual quality |
76
+ | [document-analyser](https://github.com/michael-borck/document-analyser) | PDF, DOCX, PPTX, TXT — text and readability |
77
+ | [code-analyser](https://github.com/michael-borck/code-analyser) | source code — style, complexity, and quality metrics |
78
+ | [records-analyser](https://github.com/michael-borck/records-analyser) | CSV, Excel, SQLite, Parquet, JSON — data profiling |
79
+ | [multi-analyser](https://github.com/michael-borck/multi-analyser) | any file — detects format and routes to the right tool |
80
+
81
+ ## License
82
+
83
+ MIT
@@ -0,0 +1,36 @@
1
+ # poly-lens configuration
2
+ # Copy to ~/.config/poly-lens/config.yaml or ./poly-lens.yaml and adjust.
3
+ #
4
+ # Rate limiting: handled by reverse proxy (nginx, Caddy) for public deployments.
5
+ # Each lens can be run as CLI (default, no server needed) or HTTP (start server first).
6
+
7
+ lenses:
8
+ document-lens:
9
+ type: http
10
+ url: http://localhost:8000
11
+
12
+ # audio-lens: CLI by default (audiolens must be installed).
13
+ # Switch to http if running: audiolens serve --port 8001
14
+ audio-lens:
15
+ type: cli
16
+ command: audiolens
17
+ # audio-lens:
18
+ # type: http
19
+ # url: http://localhost:8001
20
+
21
+ # data-lens: CLI by default (datalens must be installed).
22
+ # Switch to http if running: datalens serve --port 8002
23
+ data-lens:
24
+ type: cli
25
+ command: datalens
26
+ # data-lens:
27
+ # type: http
28
+ # url: http://localhost:8002
29
+
30
+ code-lens:
31
+ type: http
32
+ url: http://localhost:8003
33
+
34
+ video-lens:
35
+ type: cli
36
+ command: videolens
@@ -0,0 +1,31 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "auto-analyser"
7
+ version = "0.2.0"
8
+ description = "Auto-analyser — detect a file's format and route it to the right analyser family member"
9
+ requires-python = ">=3.11"
10
+ dependencies = [
11
+ "httpx>=0.27.0",
12
+ "PyYAML>=6.0.0",
13
+ "rich>=13.0.0",
14
+ ]
15
+
16
+ [project.optional-dependencies]
17
+ dev = [
18
+ "pytest>=8.0.0",
19
+ "pytest-cov>=4.0.0",
20
+ "respx>=0.21.0",
21
+ ]
22
+
23
+ [project.scripts]
24
+ auto-analyser = "auto_analyser.cli:main"
25
+
26
+ [tool.hatch.build.targets.wheel]
27
+ packages = ["src/auto_analyser"]
28
+
29
+ [tool.pytest.ini_options]
30
+ testpaths = ["tests"]
31
+ pythonpath = ["src"]
@@ -0,0 +1,3 @@
1
+ from .router import Router
2
+
3
+ __all__ = ["Router"]
@@ -0,0 +1,125 @@
1
+ """auto-analyser CLI — file analysis router.
2
+
3
+ Usage:
4
+ auto-analyser analyse report.pdf
5
+ auto-analyser analyse data.csv --analyser records-analyser
6
+ auto-analyser analyse recording.mp3 --json
7
+ auto-analyser detect notebook.ipynb
8
+ auto-analyser status
9
+ """
10
+
11
+ import json
12
+ import sys
13
+ from pathlib import Path
14
+
15
+
16
+ def main() -> None:
17
+ import argparse
18
+
19
+ parser = argparse.ArgumentParser(
20
+ prog="auto-analyser",
21
+ description="Route files to the right analyser",
22
+ )
23
+ sub = parser.add_subparsers(dest="command", required=True)
24
+
25
+ analyse = sub.add_parser("analyse", help="Analyse a file")
26
+ analyse.add_argument("file", type=Path, help="File to analyse")
27
+ analyse.add_argument("--analyser", help="Force a specific analyser (e.g. code-analyser)")
28
+ analyse.add_argument("--json", action="store_true", dest="as_json", help="Output raw JSON")
29
+
30
+ detect_cmd = sub.add_parser("detect", help="Show which analyser would handle a file")
31
+ detect_cmd.add_argument("file", type=Path)
32
+
33
+ sub.add_parser("status", help="Show configured analysers and whether they are reachable")
34
+
35
+ args = parser.parse_args()
36
+
37
+ if args.command == "analyse":
38
+ _cmd_analyse(args)
39
+ elif args.command == "detect":
40
+ _cmd_detect(args)
41
+ elif args.command == "status":
42
+ _cmd_status()
43
+
44
+
45
def _cmd_analyse(args) -> None:
    """Handle ``auto-analyser analyse``: route the file and print the outcome.

    Reads ``args.file``, ``args.analyser`` and ``args.as_json``. A
    ``RoutingError`` is reported on stderr (as a JSON object when ``--json``
    was given) and the process exits with status 1.
    """
    from .router import Router, RoutingError

    dispatcher = Router()

    try:
        outcome = dispatcher.route(args.file, analyser_name=args.analyser)
    except RoutingError as exc:
        report = (
            json.dumps({"error": str(exc)}, indent=2, default=str)
            if args.as_json
            else f"Error: {exc}"
        )
        print(report, file=sys.stderr)
        sys.exit(1)

    # Machine-readable mode: dump the full result and stop.
    if args.as_json:
        print(json.dumps(outcome, indent=2, default=str))
        return

    note = outcome.get("warning")
    if note:
        print(f"Note: {outcome['warning']}\n")

    print(f"Routed to: {outcome.get('routed_to', 'unknown')}")
    print()
    print("Full result (use --json for machine-readable output):")

    # The routing metadata was already shown above; summarise the rest.
    remainder = {
        key: value
        for key, value in outcome.items()
        if key not in ("routed_to", "warning")
    }
    _print_summary(remainder)
70
+
71
+
72
def _cmd_detect(args) -> None:
    """Handle ``auto-analyser detect``: report which analyser matches ``args.file``.

    Prints any detection warning first. Exits with status 1 when no analyser
    is mapped to the file's extension.
    """
    from .detector import detect

    found = detect(args.file)

    if found.warning:
        print(f"Note: {found.warning}")

    if not found.analyser:
        print(f"{args.file.name} -> unknown (no analyser configured for {found.extension})")
        sys.exit(1)

    print(f"{args.file.name} -> {found.analyser}")
83
+
84
+
85
def _cmd_status() -> None:
    """Handle ``auto-analyser status``: list analysers with a best-effort probe.

    HTTP analysers are probed with ``GET {url}/health`` (3 s timeout); every
    other type is treated as CLI and probed by running ``<command> --version``
    (5 s timeout). A failing probe is reported in the status column and never
    aborts the listing, so one broken entry cannot hide the others.
    """
    import subprocess

    import httpx

    from .config import load_config

    config = load_config()
    print("Configured analysers:\n")

    for name, cfg in config.analysers.items():
        if cfg.type == "http":
            if not cfg.url:
                # Without this guard the probe would request "None/health".
                status = "no url configured"
            else:
                try:
                    # Tolerate a trailing slash in the configured base URL.
                    httpx.get(f"{cfg.url.rstrip('/')}/health", timeout=3).raise_for_status()
                    status = "reachable"
                except Exception:
                    # Deliberately broad: any failure just means "not reachable".
                    status = "not reachable"
            print(f" {name:<22} http {cfg.url} {status}")
        else:
            if not cfg.command:
                # subprocess.run([None, ...]) would raise TypeError and abort the listing.
                status = "no command configured"
            else:
                try:
                    subprocess.run([cfg.command, "--version"], capture_output=True, timeout=5)
                    status = "installed"
                except FileNotFoundError:
                    status = "not found"
                except subprocess.TimeoutExpired:
                    status = "installed (timeout on --version)"
                except OSError as exc:
                    # e.g. PermissionError when the file exists but is not executable.
                    status = f"not runnable ({exc.strerror or exc})"
            print(f" {name:<22} cli {cfg.command} {status}")
110
+
111
+
112
def _print_summary(data: dict) -> None:
    """Print a compact, human-readable outline of an analysis result.

    Nested dicts are expanded one level, lists are collapsed to their item
    count, and every other value is printed as-is.
    """

    def _lines():
        # Lazily yield one output line at a time, in mapping order.
        for label, payload in data.items():
            if isinstance(payload, dict):
                yield f" {label}:"
                for inner_key, inner_value in payload.items():
                    yield f" {inner_key}: {inner_value}"
            elif isinstance(payload, list):
                yield f" {label}: [{len(payload)} items]"
            else:
                yield f" {label}: {payload}"

    for line in _lines():
        print(line)
122
+
123
+
124
+ if __name__ == "__main__":
125
+ main()
@@ -0,0 +1,67 @@
1
+ from dataclasses import dataclass, field
2
+ from pathlib import Path
3
+ from typing import Literal
4
+
5
+ import yaml
6
+
7
+
8
+ @dataclass
9
+ class AnalyserConfig:
10
+ type: Literal["http", "cli"]
11
+ url: str | None = None # for type=http
12
+ command: str | None = None # for type=cli
13
+ formats: list[str] = field(default_factory=list)
14
+
15
+
16
@dataclass
class FamilyConfig:
    """All configured analysers, keyed by analyser name."""

    analysers: dict[str, AnalyserConfig]

    def get(self, analyser_name: str) -> AnalyserConfig | None:
        """Return the settings for *analyser_name*, or ``None`` if it is not configured."""
        registry = self.analysers
        return registry.get(analyser_name)

    def available(self) -> list[str]:
        """Return the configured analyser names in mapping order."""
        return [*self.analysers]
25
+
26
+
27
# Built-in analysers: each one is reached over HTTP on its own localhost port.
_DEFAULTS: dict[str, dict] = {
    analyser: {"type": "http", "url": f"http://localhost:{port}"}
    for analyser, port in (
        ("document-analyser", 8000),
        ("speech-analyser", 8001),
        ("video-analyser", 8002),
        ("records-analyser", 8003),
        ("code-analyser", 8004),
        ("wordpress-analyser", 8005),
    )
}

# Candidate config files, in lookup order: working directory first, then the
# per-user config directory.
_CONFIG_PATHS = [
    Path("auto-analyser.yaml"),
    Path.home().joinpath(".config", "auto-analyser", "config.yaml"),
]
40
+
41
+
42
def load_config() -> FamilyConfig:
    """Load the analyser family configuration.

    The first existing file in ``_CONFIG_PATHS`` is parsed with
    ``yaml.safe_load`` and its ``analysers`` mapping is layered over the
    built-in defaults. Without a config file the defaults are used unchanged.

    An entry for a built-in analyser is merged field by field with that
    analyser's defaults, so overriding only ``url`` keeps the default ``type``.

    Raises:
        ValueError: if the file (or its ``analysers`` section, or any entry in
            it) is not a mapping, or if an entry ends up without a ``type``.
    """
    raw: object = {}
    source: Path | None = None

    for path in _CONFIG_PATHS:
        # is_file() rather than exists(): a directory with this name must not be opened.
        if path.is_file():
            with open(path, encoding="utf-8") as f:
                raw = yaml.safe_load(f) or {}
            source = path
            break

    if not isinstance(raw, dict):
        raise ValueError(
            f"Config file {source} must contain a mapping at the top level, "
            f"got {type(raw).__name__}."
        )

    # "analysers:" with no value parses to None; treat it like a missing section.
    overrides = raw.get("analysers") or {}
    if not isinstance(overrides, dict):
        raise ValueError(
            f"'analysers' in {source} must be a mapping of analyser name to settings, "
            f"got {type(overrides).__name__}."
        )

    analysers = {}
    # Dict-merge the keys only to keep the ordering: defaults first, then new names.
    for name in {**_DEFAULTS, **overrides}:
        override = overrides.get(name, {})
        if not isinstance(override, dict):
            raise ValueError(
                f"Analyser '{name}' in config must be a mapping, "
                f"got {type(override).__name__}."
            )
        cfg = {**_DEFAULTS.get(name, {}), **override}
        if "type" not in cfg:
            raise ValueError(
                f"Analyser '{name}' in config is missing required field 'type'. "
                f"Must be 'http' or 'cli'."
            )
        analysers[name] = AnalyserConfig(
            type=cfg["type"],
            url=cfg.get("url"),
            command=cfg.get("command"),
            formats=cfg.get("formats") or [],
        )

    return FamilyConfig(analysers=analysers)
@@ -0,0 +1,95 @@
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+
4
+
5
+ @dataclass
6
+ class DetectionResult:
7
+ analyser: str | None # None if unknown
8
+ extension: str
9
+ warning: str | None = None
10
+
11
+
12
# Extension → analyser name, built from one group of extensions per analyser.
# Group order (and order within a group) fixes the iteration order of _ROUTES.
_ROUTES: dict[str, str] = {
    suffix: analyser
    for analyser, suffixes in (
        ("document-analyser", (".pdf", ".docx", ".pptx", ".txt", ".md", ".qmd", ".rst")),
        ("speech-analyser", (".mp3", ".wav", ".m4a", ".ogg", ".flac", ".aac", ".opus")),
        (
            "records-analyser",
            (".csv", ".tsv", ".xlsx", ".xls", ".parquet", ".sqlite", ".db", ".sqlite3"),
        ),
        # records-analyser (ambiguous)
        ("records-analyser", (".json", ".yaml", ".yml", ".xml")),
        (
            "code-analyser",
            (".py", ".js", ".ts", ".tsx", ".jsx", ".html", ".css", ".scss", ".sql", ".ipynb"),
        ),
        ("video-analyser", (".mp4", ".mov", ".avi", ".webm", ".mkv")),
        ("wordpress-analyser", (".php",)),
    )
    for suffix in suffixes
}

# Template for extensions that may hold either configuration or datasets.
_AMBIGUOUS_WARNING = " ".join(
    (
        "{ext} files may be configuration data or structured datasets.",
        "auto-analyser is routing to records-analyser.",
        "For prose content, use document-analyser directly.",
    )
)

# Template for notebook formats that mix code and prose.
_NOTEBOOK_WARNING = " ".join(
    (
        "{ext} is a notebook format containing both code and prose.",
        "code-analyser will analyse the code cells.",
        "Pass extracted prose to document-analyser for writing quality analysis.",
    )
)
76
+
77
+
78
def detect(file_path: Path) -> DetectionResult:
    """Detect which analyser should handle this file.

    The decision is based solely on the lower-cased file suffix looked up in
    ``_ROUTES``; the file is never opened.

    Args:
        file_path: Path of the file to classify (a plain string is accepted too).

    Returns:
        A ``DetectionResult`` holding the analyser name (``None`` when the
        suffix is not mapped), the suffix that was looked up, and an optional
        warning describing an unknown, ambiguous or notebook-style format.
    """
    ext = Path(file_path).suffix.lower()
    analyser = _ROUTES.get(ext)

    warning = None
    if analyser is None:
        # Checked first so an unmapped notebook suffix (e.g. .rmd) is reported
        # as unknown instead of claiming that an analyser will process it.
        shown = ext or "(no extension)"
        warning = (
            f"Unknown format: {shown}. "
            f"auto-analyser does not know which analyser handles this file. "
            f"Use an analyser directly or add a mapping to your auto-analyser.yaml."
        )
    elif ext in {".json", ".yaml", ".yml", ".xml"}:
        warning = _AMBIGUOUS_WARNING.format(ext=ext.upper())
    elif ext in {".ipynb", ".qmd", ".rmd"}:
        if analyser == "code-analyser":
            warning = _NOTEBOOK_WARNING.format(ext=ext)
        else:
            # _NOTEBOOK_WARNING names code-analyser as the handler, which would
            # be wrong for a notebook routed elsewhere (.qmd -> document-analyser).
            warning = (
                f"{ext} is a notebook format containing both code and prose. "
                f"auto-analyser is routing to {analyser}. "
                f"Use code-analyser directly to analyse the code cells."
            )

    return DetectionResult(analyser=analyser, extension=ext, warning=warning)
@@ -0,0 +1,132 @@
1
+ import json
2
+ import subprocess
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ import httpx
7
+
8
+ from .config import FamilyConfig, load_config
9
+ from .detector import detect
10
+
11
+
12
+ class RoutingError(Exception):
13
+ """Raised when auto-analyser cannot route or analyse a file."""
14
+
15
+
16
+ class Router:
17
+ """Routes a file to the appropriate analyser and returns the analysis."""
18
+
19
+ def __init__(self, config: FamilyConfig | None = None) -> None:
20
+ self._config = config or load_config()
21
+
22
+ def route(
23
+ self,
24
+ file_path: "Path | str",
25
+ analyser_name: str | None = None,
26
+ ) -> dict[str, Any]:
27
+ """Analyse a file by routing to the appropriate analyser.
28
+
29
+ Returns the analysis dict with a 'routed_to' key injected.
30
+
31
+ Raises:
32
+ RoutingError: if the file is missing, format unknown, analyser not
33
+ configured, or the analyser returns an error.
34
+ """
35
+ if isinstance(file_path, str):
36
+ file_path = Path(file_path)
37
+
38
+ if not file_path.exists():
39
+ raise RoutingError(f"File not found: {file_path}")
40
+ if not file_path.is_file():
41
+ raise RoutingError(f"Not a file: {file_path}")
42
+
43
+ warning = None
44
+
45
+ if analyser_name is None:
46
+ detection = detect(file_path)
47
+ if detection.analyser is None:
48
+ raise RoutingError(
49
+ f"Unknown format: {file_path.suffix}. "
50
+ f"Use --analyser to specify an analyser directly."
51
+ )
52
+ analyser_name = detection.analyser
53
+ warning = detection.warning
54
+
55
+ analyser_cfg = self._config.get(analyser_name)
56
+ if analyser_cfg is None:
57
+ raise RoutingError(
58
+ f"Analyser '{analyser_name}' is not configured. "
59
+ f"Available: {self._config.available()}"
60
+ )
61
+
62
+ if analyser_cfg.type == "cli":
63
+ if not analyser_cfg.command:
64
+ raise RoutingError(
65
+ f"Analyser '{analyser_name}' has type=cli but no command configured."
66
+ )
67
+ data = self._call_cli(analyser_cfg.command, file_path)
68
+ elif analyser_cfg.type == "http":
69
+ if not analyser_cfg.url:
70
+ raise RoutingError(
71
+ f"Analyser '{analyser_name}' has type=http but no url configured."
72
+ )
73
+ data = self._call_http(analyser_cfg.url, file_path)
74
+ else:
75
+ raise RoutingError(f"Unknown analyser type: {analyser_cfg.type}")
76
+
77
+ data["routed_to"] = analyser_name
78
+ if warning:
79
+ data["warning"] = warning
80
+ return data
81
+
82
+ def _call_cli(self, command: str, file_path: Path) -> dict[str, Any]:
83
+ try:
84
+ proc = subprocess.run(
85
+ [command, str(file_path), "--json"],
86
+ capture_output=True,
87
+ text=True,
88
+ timeout=300,
89
+ )
90
+ if proc.returncode != 0:
91
+ try:
92
+ err = json.loads(proc.stderr)
93
+ msg = err.get("error", proc.stderr.strip())
94
+ except (json.JSONDecodeError, AttributeError):
95
+ msg = proc.stderr.strip() or f"{command} exited with code {proc.returncode}"
96
+ raise RoutingError(msg)
97
+ try:
98
+ return json.loads(proc.stdout)
99
+ except json.JSONDecodeError as e:
100
+ raise RoutingError(
101
+ f"{command} returned invalid JSON: {e}. "
102
+ f"stdout={proc.stdout[:200]!r}"
103
+ )
104
+ except FileNotFoundError:
105
+ raise RoutingError(f"CLI tool '{command}' not found. Is it installed?")
106
+ except RoutingError:
107
+ raise
108
+ except Exception as e:
109
+ raise RoutingError(str(e)) from e
110
+
111
+ def _call_http(self, url: str, file_path: Path) -> dict[str, Any]:
112
+ """POST file to {url}/analyse."""
113
+ try:
114
+ with open(file_path, "rb") as f:
115
+ with httpx.Client(timeout=300) as client:
116
+ response = client.post(
117
+ f"{url}/analyse",
118
+ files={"file": (file_path.name, f)},
119
+ )
120
+ if not response.is_success:
121
+ try:
122
+ detail = response.json().get("detail", response.text)
123
+ except Exception:
124
+ detail = response.text
125
+ raise RoutingError(f"HTTP {response.status_code}: {detail}")
126
+ return response.json()
127
+ except httpx.ConnectError:
128
+ raise RoutingError(f"Cannot connect to {url}. Is the service running?")
129
+ except RoutingError:
130
+ raise
131
+ except Exception as e:
132
+ raise RoutingError(str(e)) from e
@@ -0,0 +1,3 @@
1
+ from .router import Router
2
+
3
+ __all__ = ["Router"]