iil-task-scorer 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- iil_task_scorer-0.1.0/PKG-INFO +113 -0
- iil_task_scorer-0.1.0/README.md +101 -0
- iil_task_scorer-0.1.0/pyproject.toml +33 -0
- iil_task_scorer-0.1.0/setup.cfg +4 -0
- iil_task_scorer-0.1.0/src/iil_task_scorer.egg-info/PKG-INFO +113 -0
- iil_task_scorer-0.1.0/src/iil_task_scorer.egg-info/SOURCES.txt +11 -0
- iil_task_scorer-0.1.0/src/iil_task_scorer.egg-info/dependency_links.txt +1 -0
- iil_task_scorer-0.1.0/src/iil_task_scorer.egg-info/requires.txt +5 -0
- iil_task_scorer-0.1.0/src/iil_task_scorer.egg-info/top_level.txt +1 -0
- iil_task_scorer-0.1.0/src/task_scorer/__init__.py +33 -0
- iil_task_scorer-0.1.0/src/task_scorer/scorer.py +216 -0
- iil_task_scorer-0.1.0/src/task_scorer/types.py +139 -0
- iil_task_scorer-0.1.0/tests/test_scorer.py +322 -0
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: iil-task-scorer
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Shared scoring and routing engine for task complexity estimation
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Provides-Extra: dev
|
|
9
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
10
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
11
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
12
|
+
|
|
13
|
+
# task-scorer
|
|
14
|
+
|
|
15
|
+
Shared scoring and routing engine for task complexity estimation.
|
|
16
|
+
|
|
17
|
+
Zero-dependency Python package that provides weighted multi-dimension
|
|
18
|
+
keyword scoring with sigmoid confidence calibration. Consolidates
|
|
19
|
+
scoring logic from BFAgent (TestRequirement, LLMRouter) and
|
|
20
|
+
Orchestrator MCP (analyzer).
|
|
21
|
+
|
|
22
|
+
**ADR**: [ADR-023 Shared Scoring and Routing Engine](../../docs/adr/ADR-023-shared-scoring-routing-engine.md)
|
|
23
|
+
|
|
24
|
+
## Installation
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
# From git (recommended for Docker)
|
|
28
|
+
pip install "iil-task-scorer @ git+ssh://git@github.com/achimdehnert/platform.git@main#subdirectory=packages/task_scorer"
|
|
29
|
+
|
|
30
|
+
# Local development
|
|
31
|
+
pip install -e ".[dev]"
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Usage
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
from task_scorer import score_task, ScoringConfig, Tier
|
|
38
|
+
|
|
39
|
+
# With defaults
|
|
40
|
+
result = score_task("fix the authentication bug in the API")
|
|
41
|
+
print(result.top_type) # "security"
|
|
42
|
+
print(result.tier) # Tier.LOW
|
|
43
|
+
print(result.confidence) # 0.6607
|
|
44
|
+
print(result.signals) # ["security(auth, authentication)", "bug(bug, fix)"]
|
|
45
|
+
print(result.is_ambiguous) # False
|
|
46
|
+
|
|
47
|
+
# With custom config (e.g. from DB lookup tables)
|
|
48
|
+
config = ScoringConfig(
|
|
49
|
+
keywords={"security": ["auth", "cve", "credential"]},
|
|
50
|
+
weights={"security": 2.0},
|
|
51
|
+
tier_boundaries=(0.5, 1.5),
|
|
52
|
+
)
|
|
53
|
+
result = score_task("check auth flow", config=config)
|
|
54
|
+
|
|
55
|
+
# With structured metadata
|
|
56
|
+
result = score_task(
|
|
57
|
+
"refactor the authentication module",
|
|
58
|
+
category="security",
|
|
59
|
+
acceptance_criteria_count=5,
|
|
60
|
+
files_affected=8,
|
|
61
|
+
)
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## API
|
|
65
|
+
|
|
66
|
+
### `score_task(description, config=None, category=None, acceptance_criteria_count=0, files_affected=0) -> ScoringResult`
|
|
67
|
+
|
|
68
|
+
Main entry point. Scores a task description against all configured
|
|
69
|
+
task types using weighted keyword matching.
|
|
70
|
+
|
|
71
|
+
### `ScoringResult`
|
|
72
|
+
|
|
73
|
+
| Field | Type | Description |
|
|
74
|
+
|-------|------|-------------|
|
|
75
|
+
| `scores` | `dict[str, float]` | All type scores |
|
|
76
|
+
| `top_type` | `str` | Highest scoring type |
|
|
77
|
+
| `tier` | `Tier` | LOW / MEDIUM / HIGH |
|
|
78
|
+
| `confidence` | `float` | Sigmoid confidence [0, 1] |
|
|
79
|
+
| `signals` | `list[str]` | Debug signals |
|
|
80
|
+
| `is_ambiguous` | `bool` | True if confidence < threshold |
|
|
81
|
+
| `raw_score` | `float` | Winner's weighted score |
|
|
82
|
+
|
|
83
|
+
### `ScoringConfig`
|
|
84
|
+
|
|
85
|
+
| Field | Type | Default | Description |
|
|
86
|
+
|-------|------|---------|-------------|
|
|
87
|
+
| `keywords` | `dict[str, list[str]]` | 10 types, 80 keywords | Task type keywords |
|
|
88
|
+
| `weights` | `dict[str, float]` | 0.5 - 1.5 | Type weight multipliers |
|
|
89
|
+
| `tier_boundaries` | `tuple[float, float]` | (1.0, 4.0) | LOW/MEDIUM/HIGH boundaries |
|
|
90
|
+
| `confidence_steepness` | `float` | 8.0 | Sigmoid steepness |
|
|
91
|
+
| `confidence_threshold` | `float` | 0.65 | Ambiguity threshold |
|
|
92
|
+
|
|
93
|
+
## Testing
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
cd packages/task_scorer
|
|
97
|
+
pip install -e ".[dev]"
|
|
98
|
+
python3 -m pytest tests/ -v
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Architecture
|
|
102
|
+
|
|
103
|
+
```
|
|
104
|
+
src/task_scorer/
|
|
105
|
+
├── __init__.py # Public API exports
|
|
106
|
+
├── types.py # ScoringConfig, ScoringResult, Tier, DEFAULT_KEYWORDS
|
|
107
|
+
└── scorer.py # score_task, _score_all_types, _sigmoid_confidence
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
- **Zero dependencies** — stdlib only (math, dataclasses, enum)
|
|
111
|
+
- **Frozen dataclasses** — immutable config and results
|
|
112
|
+
- **Config injection** — defaults in code, DB-driven override via ScoringConfig
|
|
113
|
+
- **Tenant-agnostic** — pure function, no database access
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# task-scorer
|
|
2
|
+
|
|
3
|
+
Shared scoring and routing engine for task complexity estimation.
|
|
4
|
+
|
|
5
|
+
Zero-dependency Python package that provides weighted multi-dimension
|
|
6
|
+
keyword scoring with sigmoid confidence calibration. Consolidates
|
|
7
|
+
scoring logic from BFAgent (TestRequirement, LLMRouter) and
|
|
8
|
+
Orchestrator MCP (analyzer).
|
|
9
|
+
|
|
10
|
+
**ADR**: [ADR-023 Shared Scoring and Routing Engine](../../docs/adr/ADR-023-shared-scoring-routing-engine.md)
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
# From git (recommended for Docker)
|
|
16
|
+
pip install "iil-task-scorer @ git+ssh://git@github.com/achimdehnert/platform.git@main#subdirectory=packages/task_scorer"
|
|
17
|
+
|
|
18
|
+
# Local development
|
|
19
|
+
pip install -e ".[dev]"
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Usage
|
|
23
|
+
|
|
24
|
+
```python
|
|
25
|
+
from task_scorer import score_task, ScoringConfig, Tier
|
|
26
|
+
|
|
27
|
+
# With defaults
|
|
28
|
+
result = score_task("fix the authentication bug in the API")
|
|
29
|
+
print(result.top_type) # "security"
|
|
30
|
+
print(result.tier) # Tier.LOW
|
|
31
|
+
print(result.confidence) # 0.6607
|
|
32
|
+
print(result.signals) # ["security(auth, authentication)", "bug(bug, fix)"]
|
|
33
|
+
print(result.is_ambiguous) # False
|
|
34
|
+
|
|
35
|
+
# With custom config (e.g. from DB lookup tables)
|
|
36
|
+
config = ScoringConfig(
|
|
37
|
+
keywords={"security": ["auth", "cve", "credential"]},
|
|
38
|
+
weights={"security": 2.0},
|
|
39
|
+
tier_boundaries=(0.5, 1.5),
|
|
40
|
+
)
|
|
41
|
+
result = score_task("check auth flow", config=config)
|
|
42
|
+
|
|
43
|
+
# With structured metadata
|
|
44
|
+
result = score_task(
|
|
45
|
+
"refactor the authentication module",
|
|
46
|
+
category="security",
|
|
47
|
+
acceptance_criteria_count=5,
|
|
48
|
+
files_affected=8,
|
|
49
|
+
)
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## API
|
|
53
|
+
|
|
54
|
+
### `score_task(description, config=None, category=None, acceptance_criteria_count=0, files_affected=0) -> ScoringResult`
|
|
55
|
+
|
|
56
|
+
Main entry point. Scores a task description against all configured
|
|
57
|
+
task types using weighted keyword matching.
|
|
58
|
+
|
|
59
|
+
### `ScoringResult`
|
|
60
|
+
|
|
61
|
+
| Field | Type | Description |
|
|
62
|
+
|-------|------|-------------|
|
|
63
|
+
| `scores` | `dict[str, float]` | All type scores |
|
|
64
|
+
| `top_type` | `str` | Highest scoring type |
|
|
65
|
+
| `tier` | `Tier` | LOW / MEDIUM / HIGH |
|
|
66
|
+
| `confidence` | `float` | Sigmoid confidence [0, 1] |
|
|
67
|
+
| `signals` | `list[str]` | Debug signals |
|
|
68
|
+
| `is_ambiguous` | `bool` | True if confidence < threshold |
|
|
69
|
+
| `raw_score` | `float` | Winner's weighted score |
|
|
70
|
+
|
|
71
|
+
### `ScoringConfig`
|
|
72
|
+
|
|
73
|
+
| Field | Type | Default | Description |
|
|
74
|
+
|-------|------|---------|-------------|
|
|
75
|
+
| `keywords` | `dict[str, list[str]]` | 10 types, 80 keywords | Task type keywords |
|
|
76
|
+
| `weights` | `dict[str, float]` | 0.5 - 1.5 | Type weight multipliers |
|
|
77
|
+
| `tier_boundaries` | `tuple[float, float]` | (1.0, 4.0) | LOW/MEDIUM/HIGH boundaries |
|
|
78
|
+
| `confidence_steepness` | `float` | 8.0 | Sigmoid steepness |
|
|
79
|
+
| `confidence_threshold` | `float` | 0.65 | Ambiguity threshold |
|
|
80
|
+
|
|
81
|
+
## Testing
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
cd packages/task_scorer
|
|
85
|
+
pip install -e ".[dev]"
|
|
86
|
+
python3 -m pytest tests/ -v
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Architecture
|
|
90
|
+
|
|
91
|
+
```
|
|
92
|
+
src/task_scorer/
|
|
93
|
+
├── __init__.py # Public API exports
|
|
94
|
+
├── types.py # ScoringConfig, ScoringResult, Tier, DEFAULT_KEYWORDS
|
|
95
|
+
└── scorer.py # score_task, _score_all_types, _sigmoid_confidence
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
- **Zero dependencies** — stdlib only (math, dataclasses, enum)
|
|
99
|
+
- **Frozen dataclasses** — immutable config and results
|
|
100
|
+
- **Config injection** — defaults in code, DB-driven override via ScoringConfig
|
|
101
|
+
- **Tenant-agnostic** — pure function, no database access
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "iil-task-scorer"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Shared scoring and routing engine for task complexity estimation"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
# Zero runtime dependencies — stdlib only
|
|
13
|
+
dependencies = []
|
|
14
|
+
|
|
15
|
+
[project.optional-dependencies]
|
|
16
|
+
dev = [
|
|
17
|
+
"pytest>=7.0",
|
|
18
|
+
"pytest-cov>=4.0",
|
|
19
|
+
"ruff>=0.4",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[tool.setuptools.packages.find]
|
|
23
|
+
where = ["src"]
|
|
24
|
+
|
|
25
|
+
[tool.ruff]
|
|
26
|
+
target-version = "py311"
|
|
27
|
+
line-length = 100
|
|
28
|
+
|
|
29
|
+
[tool.ruff.lint]
|
|
30
|
+
select = ["E", "F", "I", "W", "UP", "B", "SIM"]
|
|
31
|
+
|
|
32
|
+
[tool.pytest.ini_options]
|
|
33
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: iil-task-scorer
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Shared scoring and routing engine for task complexity estimation
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Provides-Extra: dev
|
|
9
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
10
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
11
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
12
|
+
|
|
13
|
+
# task-scorer
|
|
14
|
+
|
|
15
|
+
Shared scoring and routing engine for task complexity estimation.
|
|
16
|
+
|
|
17
|
+
Zero-dependency Python package that provides weighted multi-dimension
|
|
18
|
+
keyword scoring with sigmoid confidence calibration. Consolidates
|
|
19
|
+
scoring logic from BFAgent (TestRequirement, LLMRouter) and
|
|
20
|
+
Orchestrator MCP (analyzer).
|
|
21
|
+
|
|
22
|
+
**ADR**: [ADR-023 Shared Scoring and Routing Engine](../../docs/adr/ADR-023-shared-scoring-routing-engine.md)
|
|
23
|
+
|
|
24
|
+
## Installation
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
# From git (recommended for Docker)
|
|
28
|
+
pip install "iil-task-scorer @ git+ssh://git@github.com/achimdehnert/platform.git@main#subdirectory=packages/task_scorer"
|
|
29
|
+
|
|
30
|
+
# Local development
|
|
31
|
+
pip install -e ".[dev]"
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Usage
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
from task_scorer import score_task, ScoringConfig, Tier
|
|
38
|
+
|
|
39
|
+
# With defaults
|
|
40
|
+
result = score_task("fix the authentication bug in the API")
|
|
41
|
+
print(result.top_type) # "security"
|
|
42
|
+
print(result.tier) # Tier.LOW
|
|
43
|
+
print(result.confidence) # 0.6607
|
|
44
|
+
print(result.signals) # ["security(auth, authentication)", "bug(bug, fix)"]
|
|
45
|
+
print(result.is_ambiguous) # False
|
|
46
|
+
|
|
47
|
+
# With custom config (e.g. from DB lookup tables)
|
|
48
|
+
config = ScoringConfig(
|
|
49
|
+
keywords={"security": ["auth", "cve", "credential"]},
|
|
50
|
+
weights={"security": 2.0},
|
|
51
|
+
tier_boundaries=(0.5, 1.5),
|
|
52
|
+
)
|
|
53
|
+
result = score_task("check auth flow", config=config)
|
|
54
|
+
|
|
55
|
+
# With structured metadata
|
|
56
|
+
result = score_task(
|
|
57
|
+
"refactor the authentication module",
|
|
58
|
+
category="security",
|
|
59
|
+
acceptance_criteria_count=5,
|
|
60
|
+
files_affected=8,
|
|
61
|
+
)
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## API
|
|
65
|
+
|
|
66
|
+
### `score_task(description, config=None, category=None, acceptance_criteria_count=0, files_affected=0) -> ScoringResult`
|
|
67
|
+
|
|
68
|
+
Main entry point. Scores a task description against all configured
|
|
69
|
+
task types using weighted keyword matching.
|
|
70
|
+
|
|
71
|
+
### `ScoringResult`
|
|
72
|
+
|
|
73
|
+
| Field | Type | Description |
|
|
74
|
+
|-------|------|-------------|
|
|
75
|
+
| `scores` | `dict[str, float]` | All type scores |
|
|
76
|
+
| `top_type` | `str` | Highest scoring type |
|
|
77
|
+
| `tier` | `Tier` | LOW / MEDIUM / HIGH |
|
|
78
|
+
| `confidence` | `float` | Sigmoid confidence [0, 1] |
|
|
79
|
+
| `signals` | `list[str]` | Debug signals |
|
|
80
|
+
| `is_ambiguous` | `bool` | True if confidence < threshold |
|
|
81
|
+
| `raw_score` | `float` | Winner's weighted score |
|
|
82
|
+
|
|
83
|
+
### `ScoringConfig`
|
|
84
|
+
|
|
85
|
+
| Field | Type | Default | Description |
|
|
86
|
+
|-------|------|---------|-------------|
|
|
87
|
+
| `keywords` | `dict[str, list[str]]` | 10 types, 80 keywords | Task type keywords |
|
|
88
|
+
| `weights` | `dict[str, float]` | 0.5 - 1.5 | Type weight multipliers |
|
|
89
|
+
| `tier_boundaries` | `tuple[float, float]` | (1.0, 4.0) | LOW/MEDIUM/HIGH boundaries |
|
|
90
|
+
| `confidence_steepness` | `float` | 8.0 | Sigmoid steepness |
|
|
91
|
+
| `confidence_threshold` | `float` | 0.65 | Ambiguity threshold |
|
|
92
|
+
|
|
93
|
+
## Testing
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
cd packages/task_scorer
|
|
97
|
+
pip install -e ".[dev]"
|
|
98
|
+
python3 -m pytest tests/ -v
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Architecture
|
|
102
|
+
|
|
103
|
+
```
|
|
104
|
+
src/task_scorer/
|
|
105
|
+
├── __init__.py # Public API exports
|
|
106
|
+
├── types.py # ScoringConfig, ScoringResult, Tier, DEFAULT_KEYWORDS
|
|
107
|
+
└── scorer.py # score_task, _score_all_types, _sigmoid_confidence
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
- **Zero dependencies** — stdlib only (math, dataclasses, enum)
|
|
111
|
+
- **Frozen dataclasses** — immutable config and results
|
|
112
|
+
- **Config injection** — defaults in code, DB-driven override via ScoringConfig
|
|
113
|
+
- **Tenant-agnostic** — pure function, no database access
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/iil_task_scorer.egg-info/PKG-INFO
|
|
4
|
+
src/iil_task_scorer.egg-info/SOURCES.txt
|
|
5
|
+
src/iil_task_scorer.egg-info/dependency_links.txt
|
|
6
|
+
src/iil_task_scorer.egg-info/requires.txt
|
|
7
|
+
src/iil_task_scorer.egg-info/top_level.txt
|
|
8
|
+
src/task_scorer/__init__.py
|
|
9
|
+
src/task_scorer/scorer.py
|
|
10
|
+
src/task_scorer/types.py
|
|
11
|
+
tests/test_scorer.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
task_scorer
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Shared Scoring and Routing Engine for task complexity estimation.
|
|
3
|
+
|
|
4
|
+
Zero-dependency Python package that provides weighted multi-dimension
|
|
5
|
+
keyword scoring with sigmoid confidence calibration. Consolidates
|
|
6
|
+
scoring logic from BFAgent (TestRequirement, LLMRouter) and
|
|
7
|
+
Orchestrator MCP (analyzer).
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
from task_scorer import score_task, ScoringConfig
|
|
11
|
+
|
|
12
|
+
# With defaults
|
|
13
|
+
result = score_task("fix the authentication bug in the API")
|
|
14
|
+
print(result.tier) # Tier.LOW
|
|
15
|
+
print(result.top_type) # "security"
|
|
16
|
+
print(result.confidence) # 0.6607
|
|
17
|
+
|
|
18
|
+
# With custom config (e.g. from DB)
|
|
19
|
+
config = ScoringConfig(keywords={"security": ["auth", "cve"]})
|
|
20
|
+
result = score_task("check auth flow", config=config)
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from .scorer import score_task
|
|
24
|
+
from .types import ScoringConfig, ScoringResult, Tier
|
|
25
|
+
|
|
26
|
+
# Names re-exported as the package's public API; everything else in the
# submodules (underscore-prefixed helpers, default tables) is internal.
__all__ = [
    "score_task",
    "ScoringConfig",
    "ScoringResult",
    "Tier",
]

# Package version string; pyproject.toml declares the same "0.1.0".
__version__ = "0.1.0"
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core scoring engine for task complexity estimation.
|
|
3
|
+
|
|
4
|
+
Implements weighted multi-dimension keyword scoring with sigmoid
|
|
5
|
+
confidence calibration. Inspired by ClawRouter's 14-dimension
|
|
6
|
+
scoring approach, adapted for server-side Python use.
|
|
7
|
+
|
|
8
|
+
This module is the single entry point for scoring. It has zero
|
|
9
|
+
external dependencies (stdlib math only).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import math
|
|
15
|
+
|
|
16
|
+
from .types import ScoringConfig, ScoringResult, Tier
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def score_task(
    description: str,
    config: ScoringConfig | None = None,
    category: str | None = None,
    acceptance_criteria_count: int = 0,
    files_affected: int = 0,
) -> ScoringResult:
    """Estimate the complexity of a task from its free-text description.

    Public entry point of the package. Every configured task type is
    scored by weighted keyword matching; the best-scoring type is then
    mapped to a tier, and the gap to the second-best type is turned
    into a confidence value.

    Args:
        description: Task description text to score.
        config: Scoring configuration. A default ``ScoringConfig()`` is
            built when this is None.
        category: Optional task category (e.g. 'security', 'refactor').
            The metadata bonus is added to this type's score, but only
            when the category is one of the scored task types.
        acceptance_criteria_count: Number of acceptance criteria; feeds
            the metadata bonus.
        files_affected: Number of files likely affected; feeds the
            metadata bonus.

    Returns:
        ScoringResult with all scores, the winning type, tier,
        confidence, debug signals, ambiguity flag and raw winner score.
        When the best score is exactly 0.0 a fixed fallback result is
        returned (type "feature", Tier.LOW, confidence 0.5, ambiguous).
    """
    cfg = ScoringConfig() if config is None else config

    scores, signals = _score_all_types(
        description=description,
        keywords=cfg.keywords,
        weights=cfg.weights,
    )

    # The bonus is always computed, but it only takes effect below when
    # `category` names one of the scored types; otherwise the two counts
    # have no influence on the result.
    metadata_bonus = _metadata_bonus(
        category=category,
        acceptance_criteria_count=acceptance_criteria_count,
        files_affected=files_affected,
    )
    if category and category in scores:
        scores[category] += metadata_bonus
        if metadata_bonus > 0:
            signals.append(f"category_bonus({category}={metadata_bonus:.1f})")

    # Highest score first; sorted() is stable, so ties keep the order in
    # which the task types were configured.
    ranking = sorted(scores.items(), key=lambda entry: entry[1], reverse=True)
    winner_type, winner_score = ranking[0]
    runner_up_score = 0.0 if len(ranking) < 2 else ranking[1][1]

    # Nothing matched (and no bonus applied): return the fixed fallback.
    if winner_score == 0.0:
        return ScoringResult(
            scores=scores,
            top_type="feature",
            tier=Tier.LOW,
            confidence=0.5,
            signals=["no keywords matched"],
            is_ambiguous=True,
            raw_score=0.0,
        )

    confidence = _sigmoid_confidence(
        winner_score=winner_score,
        runner_up_score=runner_up_score,
        steepness=cfg.confidence_steepness,
    )

    return ScoringResult(
        scores=scores,
        top_type=winner_type,
        tier=_score_to_tier(winner_score, cfg.tier_boundaries),
        confidence=round(confidence, 4),
        signals=signals,
        # Ambiguity is decided on the unrounded confidence.
        is_ambiguous=confidence < cfg.confidence_threshold,
        raw_score=round(winner_score, 4),
    )
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _score_all_types(
|
|
109
|
+
description: str,
|
|
110
|
+
keywords: dict[str, list[str]],
|
|
111
|
+
weights: dict[str, float],
|
|
112
|
+
) -> tuple[dict[str, float], list[str]]:
|
|
113
|
+
"""Score description against all task types.
|
|
114
|
+
|
|
115
|
+
For each task type, computes:
|
|
116
|
+
hit_ratio = matched_keywords / total_keywords
|
|
117
|
+
weighted_score = hit_ratio * type_weight
|
|
118
|
+
|
|
119
|
+
Returns (scores_dict, signals_list).
|
|
120
|
+
"""
|
|
121
|
+
desc_lower = description.lower()
|
|
122
|
+
scores: dict[str, float] = {}
|
|
123
|
+
signals: list[str] = []
|
|
124
|
+
|
|
125
|
+
for task_type, kw_list in keywords.items():
|
|
126
|
+
if not kw_list:
|
|
127
|
+
scores[task_type] = 0.0
|
|
128
|
+
continue
|
|
129
|
+
|
|
130
|
+
matches = [kw for kw in kw_list if kw in desc_lower]
|
|
131
|
+
|
|
132
|
+
if not matches:
|
|
133
|
+
scores[task_type] = 0.0
|
|
134
|
+
continue
|
|
135
|
+
|
|
136
|
+
hit_ratio = len(matches) / len(kw_list)
|
|
137
|
+
type_weight = weights.get(task_type, 1.0)
|
|
138
|
+
score = hit_ratio * type_weight
|
|
139
|
+
|
|
140
|
+
scores[task_type] = round(score, 4)
|
|
141
|
+
signals.append(f"{task_type}({', '.join(matches[:3])})")
|
|
142
|
+
|
|
143
|
+
return scores, signals
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _sigmoid_confidence(
|
|
147
|
+
winner_score: float,
|
|
148
|
+
runner_up_score: float,
|
|
149
|
+
steepness: float,
|
|
150
|
+
) -> float:
|
|
151
|
+
"""Compute confidence via sigmoid on score gap.
|
|
152
|
+
|
|
153
|
+
Maps the gap between winner and runner-up through a sigmoid
|
|
154
|
+
function to produce a value in (0, 1). A large gap = high
|
|
155
|
+
confidence; small gap = low confidence (~0.5).
|
|
156
|
+
|
|
157
|
+
Args:
|
|
158
|
+
winner_score: Score of the winning task type.
|
|
159
|
+
runner_up_score: Score of the second-place task type.
|
|
160
|
+
steepness: Sigmoid curve steepness (default 8.0).
|
|
161
|
+
|
|
162
|
+
Returns:
|
|
163
|
+
Confidence value in (0.0, 1.0).
|
|
164
|
+
"""
|
|
165
|
+
gap = winner_score - runner_up_score
|
|
166
|
+
return 1.0 / (1.0 + math.exp(-steepness * gap))
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _score_to_tier(
    score: float,
    boundaries: tuple[float, float],
) -> Tier:
    """Translate a raw weighted score into a complexity tier.

    Args:
        score: The winning type's weighted score.
        boundaries: ``(low_max, medium_max)`` thresholds; both are
            inclusive upper bounds.

    Returns:
        Tier.LOW when ``score <= low_max``, otherwise Tier.MEDIUM when
        ``score <= medium_max``, otherwise Tier.HIGH.
    """
    low_max, medium_max = boundaries

    # Check the ceilings in ascending tier order; the first one that
    # the score does not exceed decides the tier.
    for ceiling, tier in ((low_max, Tier.LOW), (medium_max, Tier.MEDIUM)):
        if score <= ceiling:
            return tier
    return Tier.HIGH
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _metadata_bonus(
|
|
192
|
+
category: str | None,
|
|
193
|
+
acceptance_criteria_count: int,
|
|
194
|
+
files_affected: int,
|
|
195
|
+
) -> float:
|
|
196
|
+
"""Compute bonus score from structured task metadata.
|
|
197
|
+
|
|
198
|
+
Mirrors the heuristic signals from BFAgent's TestRequirement
|
|
199
|
+
and LLMRouter that use category, criteria count, and file
|
|
200
|
+
count as scoring dimensions beyond keyword matching.
|
|
201
|
+
|
|
202
|
+
Returns a bonus value to add to the matching task type score.
|
|
203
|
+
"""
|
|
204
|
+
bonus = 0.0
|
|
205
|
+
|
|
206
|
+
if acceptance_criteria_count >= 5:
|
|
207
|
+
bonus += 0.4
|
|
208
|
+
elif acceptance_criteria_count >= 2:
|
|
209
|
+
bonus += 0.2
|
|
210
|
+
|
|
211
|
+
if files_affected > 5:
|
|
212
|
+
bonus += 0.5
|
|
213
|
+
elif files_affected > 2:
|
|
214
|
+
bonus += 0.2
|
|
215
|
+
|
|
216
|
+
return bonus
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Type definitions for task_scorer.
|
|
3
|
+
|
|
4
|
+
All public types used by the scoring engine. Uses stdlib-only
|
|
5
|
+
dataclasses (no Pydantic) to maintain zero-dependency constraint.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from enum import Enum
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Tier(Enum):
    """Complexity tier for a scored task.

    Members are ordered conceptually LOW < MEDIUM < HIGH; the values
    are the lowercase string names.
    """

    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass(frozen=True)
class ScoringConfig:
    """Configuration for the scoring engine.

    All fields have defaults. Override individual fields to customize
    scoring behavior (e.g. inject DB-driven keywords).

    Attributes:
        keywords: Mapping of task_type -> list of keywords.
            Each keyword is matched as a substring in the description.
            An empty mapping is replaced by a copy of DEFAULT_KEYWORDS.
        weights: Mapping of task_type -> weight multiplier.
            Higher weight = stronger signal when keywords match.
            An empty mapping is replaced by a copy of DEFAULT_WEIGHTS.
        tier_boundaries: (low_max, medium_max) score thresholds.
            score <= low_max -> LOW, score <= medium_max -> MEDIUM, else HIGH.
        confidence_steepness: Sigmoid steepness for confidence calibration.
            Higher = sharper transition between confident/uncertain.
        confidence_threshold: Below this confidence, result is flagged
            as ambiguous (result.is_ambiguous = True).

    NOTE(review): with the shipped defaults a keyword score is at most
    1.5 (hit ratio <= 1 times the largest default weight) and the
    metadata bonus in scorer.py adds at most 0.9, so the default
    ``medium_max`` of 4.0 is never exceeded and Tier.HIGH cannot be
    produced without custom weights or boundaries. Confirm whether the
    default boundaries are intended.
    """

    keywords: dict[str, list[str]] = field(default_factory=dict)
    weights: dict[str, float] = field(default_factory=dict)
    tier_boundaries: tuple[float, float] = (1.0, 4.0)
    confidence_steepness: float = 8.0
    confidence_threshold: float = 0.65

    def __post_init__(self) -> None:
        """Fill in the default keywords/weights when none were given."""
        # frozen=True blocks normal attribute assignment, so the
        # defaults are installed through object.__setattr__.
        if not self.keywords:
            # Copy each keyword list as well as the dict: a shallow
            # dict copy would share the lists with DEFAULT_KEYWORDS,
            # so mutating this config would alter the defaults used by
            # every later config.
            object.__setattr__(
                self,
                "keywords",
                {task_type: list(kw_list) for task_type, kw_list in DEFAULT_KEYWORDS.items()},
            )
        if not self.weights:
            # Weight values are plain floats; a one-level copy suffices.
            object.__setattr__(self, "weights", dict(DEFAULT_WEIGHTS))
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass(frozen=True)
class ScoringResult:
    """Result of scoring a task description.

    The dataclass is frozen, so fields cannot be reassigned; the
    ``scores`` dict and ``signals`` list are ordinary mutable
    containers.

    Attributes:
        scores: All type scores: {"security": 1.5, "bug": 0.3, ...}.
        top_type: Highest scoring type (e.g. "security").
        tier: Mapped complexity tier (LOW/MEDIUM/HIGH).
        confidence: Sigmoid confidence [0.0, 1.0] based on score gap.
        signals: Debug signals showing which keywords matched.
        is_ambiguous: True if confidence < threshold.
        raw_score: The winning type's raw weighted score.
    """

    scores: dict[str, float]
    top_type: str
    tier: Tier
    confidence: float
    signals: list[str]
    is_ambiguous: bool
    raw_score: float
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# ============================================================================
|
|
81
|
+
# DEFAULT CONFIGURATION
|
|
82
|
+
# Unified keyword set from BFAgent + Orchestrator + ClawRouter analysis.
|
|
83
|
+
# See ADR-023 Appendix A for divergence analysis.
|
|
84
|
+
# ============================================================================
|
|
85
|
+
|
|
86
|
+
# Default task type -> keyword table (10 types, 80 keywords), used by
# ScoringConfig when no keywords are supplied. The scorer matches each
# keyword as a substring of the lower-cased description, so:
#   * a keyword contained in another keyword of the same type is
#     counted together with it (e.g. "auth" also matches wherever
#     "authentication" or "authorization" does, "format" wherever
#     "formatting" does, "doc" wherever "docstring" does);
#   * "integration" is listed under both "architecture" and "test" and
#     therefore contributes to both types.
DEFAULT_KEYWORDS: dict[str, list[str]] = {
    "architecture": [
        "architecture", "architektur", "design", "adr", "strategy",
        "integration", "pattern", "diagram",
    ],
    "security": [
        "security", "auth", "authentication", "authorization",
        "permission", "credential", "secret", "vulnerability", "cve",
    ],
    "breaking_change": [
        "breaking", "migrate", "migration", "deprecate", "remove",
        "drop support", "incompatible",
    ],
    "feature": [
        "feature", "add", "implement", "create", "new",
        "build", "introduce",
    ],
    "bug": [
        "bug", "fix", "issue", "error", "broken",
        "crash", "regression", "fails",
    ],
    "refactor": [
        "refactor", "clean", "improve", "optimize", "optimization",
        "simplify", "extract", "reorganize", "caching", "performance",
    ],
    "test": [
        "test", "spec", "coverage", "pytest", "assert",
        "mock", "fixture", "integration",
    ],
    "docs": [
        "doc", "readme", "comment", "docstring", "documentation",
        "sphinx", "changelog",
    ],
    "lint": [
        "lint", "format", "formatting", "style", "ruff", "flake8",
    ],
    "typo": [
        "typo", "spelling", "minor", "whitespace", "css",
        "label", "text", "button", "spacing", "icon",
    ],
}
|
|
127
|
+
|
|
128
|
+
# Default task type -> weight multiplier, used by ScoringConfig when no
# weights are supplied. The scorer multiplies a type's keyword hit
# ratio (at most 1.0) by this weight, so each value is also the
# largest keyword score that type can reach. Types absent from a
# weights mapping are scored with a weight of 1.0.
DEFAULT_WEIGHTS: dict[str, float] = {
    "architecture": 1.5,
    "security": 1.5,
    "breaking_change": 1.4,
    "feature": 1.0,
    "bug": 1.0,
    "refactor": 0.9,
    "test": 0.8,
    "docs": 0.7,
    "lint": 0.6,
    "typo": 0.5,
}
|
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tests for the task_scorer package.
|
|
3
|
+
|
|
4
|
+
Covers: score_task, _score_all_types, _sigmoid_confidence,
|
|
5
|
+
_score_to_tier, _metadata_bonus, ScoringConfig defaults,
|
|
6
|
+
custom config injection, edge cases.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import pytest
|
|
10
|
+
|
|
11
|
+
from task_scorer import ScoringConfig, ScoringResult, Tier, score_task
|
|
12
|
+
from task_scorer.scorer import (
|
|
13
|
+
_metadata_bonus,
|
|
14
|
+
_score_all_types,
|
|
15
|
+
_score_to_tier,
|
|
16
|
+
_sigmoid_confidence,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# ============================================================================
|
|
21
|
+
# _score_all_types
|
|
22
|
+
# ============================================================================
|
|
23
|
+
|
|
24
|
+
class TestScoreAllTypes:
    """Tests for weighted keyword scoring via ``_score_all_types``."""

    def test_should_return_scores_for_all_types(self) -> None:
        """One score entry is produced per configured task type."""
        config = ScoringConfig()
        scores, _ = _score_all_types(
            "hello world", config.keywords, config.weights,
        )
        assert len(scores) == len(config.keywords)
        # A matching count alone would also pass for a wrong set of keys,
        # so check that the scored types are exactly the configured ones.
        assert set(scores) == set(config.keywords)

    def test_should_score_zero_for_no_matches(self) -> None:
        """Text matching no keyword scores 0.0 everywhere with no signals."""
        config = ScoringConfig()
        scores, signals = _score_all_types(
            "xyzzy", config.keywords, config.weights,
        )
        assert all(v == 0.0 for v in scores.values())
        assert signals == []

    def test_should_score_bug_highest_for_bug_text(self) -> None:
        """Bug-heavy wording makes ``bug`` the top-scoring type."""
        config = ScoringConfig()
        scores, _ = _score_all_types(
            "fix the bug crash error", config.keywords, config.weights,
        )
        max_type = max(scores, key=scores.get)
        assert max_type == "bug"
        assert scores["bug"] > 0

    def test_should_score_security_highest_for_auth(self) -> None:
        """Auth/credential wording makes ``security`` the top-scoring type."""
        config = ScoringConfig()
        scores, _ = _score_all_types(
            "fix auth permission credential vulnerability",
            config.keywords, config.weights,
        )
        max_type = max(scores, key=scores.get)
        assert max_type == "security"

    def test_should_generate_signals_for_matches(self) -> None:
        """At least one signal mentions a matched type or keyword."""
        config = ScoringConfig()
        _, signals = _score_all_types(
            "refactor and clean the code", config.keywords, config.weights,
        )
        assert any("refactor" in s for s in signals)

    def test_should_handle_empty_keyword_list(self) -> None:
        """A type with no keywords scores 0.0 instead of raising."""
        scores, _ = _score_all_types(
            "test", {"custom": []}, {"custom": 1.0},
        )
        assert scores["custom"] == 0.0

    def test_should_apply_weights(self) -> None:
        """The same match scores higher under a larger weight."""
        scores_high, _ = _score_all_types(
            "auth", {"security": ["auth"]}, {"security": 2.0},
        )
        scores_low, _ = _score_all_types(
            "auth", {"security": ["auth"]}, {"security": 0.5},
        )
        assert scores_high["security"] > scores_low["security"]
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# ============================================================================
|
|
84
|
+
# _sigmoid_confidence
|
|
85
|
+
# ============================================================================
|
|
86
|
+
|
|
87
|
+
class TestSigmoidConfidence:
    """Checks ``_sigmoid_confidence`` against the gap between its two scores."""

    def test_should_return_0_5_for_zero_gap(self) -> None:
        """Two equal scores give a confidence of about one half."""
        result = _sigmoid_confidence(0.5, 0.5, steepness=8.0)
        assert result == pytest.approx(0.5, abs=0.01)

    def test_should_return_high_for_large_gap(self) -> None:
        """A first score far above the second gives confidence above 0.95."""
        result = _sigmoid_confidence(1.0, 0.0, steepness=8.0)
        assert result > 0.95

    def test_should_return_low_for_negative_gap(self) -> None:
        """A first score far below the second gives confidence below 0.05."""
        result = _sigmoid_confidence(0.0, 1.0, steepness=8.0)
        assert result < 0.05

    def test_should_be_monotonically_increasing(self) -> None:
        """Confidence grows strictly as the second score drops away."""
        narrow, middle, wide = [
            _sigmoid_confidence(0.5, other, steepness=8.0)
            for other in (0.4, 0.2, 0.0)
        ]
        assert narrow < middle < wide

    def test_should_respect_steepness(self) -> None:
        """For one positive gap, a larger steepness gives higher confidence."""
        gentle = _sigmoid_confidence(0.6, 0.4, steepness=2.0)
        sharp = _sigmoid_confidence(0.6, 0.4, steepness=20.0)
        # A steeper curve pushes the same gap further from 0.5.
        assert sharp > gentle
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
# ============================================================================
|
|
116
|
+
# _score_to_tier
|
|
117
|
+
# ============================================================================
|
|
118
|
+
|
|
119
|
+
class TestScoreToTier:
    """Checks how ``_score_to_tier`` maps a score onto a ``Tier``."""

    def test_should_return_low_for_small_score(self) -> None:
        """A score under the first boundary is LOW."""
        tier = _score_to_tier(0.5, (1.0, 4.0))
        assert tier == Tier.LOW

    def test_should_return_medium_for_mid_score(self) -> None:
        """A score between the two boundaries is MEDIUM."""
        tier = _score_to_tier(2.0, (1.0, 4.0))
        assert tier == Tier.MEDIUM

    def test_should_return_high_for_large_score(self) -> None:
        """A score over the second boundary is HIGH."""
        tier = _score_to_tier(5.0, (1.0, 4.0))
        assert tier == Tier.HIGH

    def test_should_handle_boundary_low_max(self) -> None:
        """A score exactly on the first boundary still counts as LOW."""
        tier = _score_to_tier(1.0, (1.0, 4.0))
        assert tier == Tier.LOW

    def test_should_handle_boundary_medium_max(self) -> None:
        """A score exactly on the second boundary still counts as MEDIUM."""
        tier = _score_to_tier(4.0, (1.0, 4.0))
        assert tier == Tier.MEDIUM

    def test_should_respect_custom_boundaries(self) -> None:
        """Caller-supplied boundaries replace the usual thresholds."""
        custom = (0.2, 0.5)
        assert _score_to_tier(0.3, custom) == Tier.MEDIUM
        assert _score_to_tier(0.6, custom) == Tier.HIGH
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
# ============================================================================
|
|
143
|
+
# _metadata_bonus
|
|
144
|
+
# ============================================================================
|
|
145
|
+
|
|
146
|
+
class TestMetadataBonus:
    """Checks the bonus ``_metadata_bonus`` adds for criteria and file counts."""

    def test_should_return_zero_for_no_metadata(self) -> None:
        """Zero criteria and zero files give no bonus."""
        bonus = _metadata_bonus(None, 0, 0)
        assert bonus == 0.0

    def test_should_add_bonus_for_many_criteria(self) -> None:
        """A criteria count of 5 gives a bonus of about 0.4."""
        assert _metadata_bonus(None, 5, 0) == pytest.approx(0.4)

    def test_should_add_bonus_for_some_criteria(self) -> None:
        """A criteria count of 3 gives a bonus of about 0.2."""
        assert _metadata_bonus(None, 3, 0) == pytest.approx(0.2)

    def test_should_add_bonus_for_many_files(self) -> None:
        """A file count of 6 gives a bonus of about 0.5."""
        assert _metadata_bonus(None, 0, 6) == pytest.approx(0.5)

    def test_should_add_bonus_for_some_files(self) -> None:
        """A file count of 3 gives a bonus of about 0.2."""
        assert _metadata_bonus(None, 0, 3) == pytest.approx(0.2)

    def test_should_combine_criteria_and_files(self) -> None:
        """Criteria and file bonuses add up (0.4 + 0.5)."""
        assert _metadata_bonus(None, 5, 6) == pytest.approx(0.9)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
# ============================================================================
|
|
174
|
+
# score_task (integration)
|
|
175
|
+
# ============================================================================
|
|
176
|
+
|
|
177
|
+
class TestScoreTask:
    """Integration tests for the main ``score_task`` function."""

    def test_should_return_scoring_result(self) -> None:
        """The result is a ScoringResult with a Tier and a 0..1 confidence."""
        result = score_task("fix the bug")
        assert isinstance(result, ScoringResult)
        assert isinstance(result.tier, Tier)
        assert 0 <= result.confidence <= 1

    def test_should_detect_clear_bug(self) -> None:
        """Unambiguous bug wording is classified as ``bug`` with confidence."""
        result = score_task("fix the bug crash error regression")
        assert result.top_type == "bug"
        assert result.confidence > 0.7

    def test_should_detect_clear_security(self) -> None:
        """Unambiguous security wording is classified as ``security``."""
        result = score_task("auth permission credential vulnerability")
        assert result.top_type == "security"
        assert result.confidence > 0.7

    def test_should_detect_architecture(self) -> None:
        """Design/architecture wording is classified as ``architecture``."""
        result = score_task("design the architecture pattern strategy")
        assert result.top_type == "architecture"

    def test_should_detect_refactor(self) -> None:
        """Clean-up wording is classified as ``refactor``."""
        result = score_task("refactor clean improve optimize simplify")
        assert result.top_type == "refactor"

    def test_should_detect_docs(self) -> None:
        """Documentation wording is classified as ``docs``."""
        result = score_task("update readme docstring documentation")
        assert result.top_type == "docs"

    def test_should_default_to_feature_no_keywords(self) -> None:
        """With no keyword hits the result falls back to an ambiguous feature."""
        result = score_task("do something completely random xyzzy")
        assert result.top_type == "feature"
        # Confidence is a float; compare with a tolerance, as the sigmoid
        # zero-gap test in this module does, rather than exact equality.
        assert result.confidence == pytest.approx(0.5)
        assert result.is_ambiguous is True

    def test_should_handle_empty_string(self) -> None:
        """An empty description is an ambiguous feature, not an error."""
        result = score_task("")
        assert result.top_type == "feature"
        assert result.is_ambiguous is True

    def test_should_flag_ambiguous_on_low_confidence(self) -> None:
        """Text hitting several types does not reach full confidence."""
        result = score_task("fix the architecture docs")
        # Multiple types match = lower confidence
        # NOTE(review): despite the test name, ``is_ambiguous`` is not
        # asserted here; the ambiguity threshold is not visible from this
        # file, so only the confidence bound is checked.
        assert result.confidence < 1.0

    def test_should_include_signals(self) -> None:
        """Each matched type contributes at least one signal."""
        result = score_task("fix bug and add feature")
        assert len(result.signals) >= 2

    def test_should_include_all_scores(self) -> None:
        """Scores are reported for types that did not match as well."""
        result = score_task("fix the bug")
        assert "bug" in result.scores
        assert "security" in result.scores

    def test_should_apply_category_bonus(self) -> None:
        """A category with no keyword hits leaves that type's score unchanged."""
        result_no_cat = score_task("fix something")
        result_with_cat = score_task("fix something", category="security")
        # Category bonus shouldn't change score if no security keywords
        score_without = result_no_cat.scores.get("security", 0)
        score_with = result_with_cat.scores.get("security", 0)
        assert score_without == score_with

    def test_should_apply_criteria_bonus(self) -> None:
        """A matching category plus many criteria emits a category_bonus signal."""
        result = score_task(
            "security auth vulnerability",
            category="security",
            acceptance_criteria_count=6,
        )
        assert result.top_type == "security"
        assert "category_bonus" in " ".join(result.signals)

    def test_should_apply_files_bonus_with_category(self) -> None:
        """Scoring with a category and many affected files stays positive."""
        result = score_task(
            "security auth vulnerability",
            category="security",
            files_affected=10,
        )
        # NOTE(review): this only checks the score is positive; it does not
        # isolate the contribution of ``files_affected``.
        assert result.scores["security"] > 0

    def test_should_be_idempotent(self) -> None:
        """Scoring the same text twice gives the same type, confidence, tier."""
        r1 = score_task("fix the authentication bug")
        r2 = score_task("fix the authentication bug")
        assert r1.top_type == r2.top_type
        assert r1.confidence == r2.confidence
        assert r1.tier == r2.tier
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
# ============================================================================
|
|
265
|
+
# ScoringConfig
|
|
266
|
+
# ============================================================================
|
|
267
|
+
|
|
268
|
+
class TestScoringConfig:
    """Checks ``ScoringConfig`` defaults and caller-supplied overrides."""

    def test_should_have_defaults(self) -> None:
        """A bare config has keywords, weights and a steepness of 8.0."""
        defaults = ScoringConfig()
        assert len(defaults.keywords) > 0
        assert len(defaults.weights) > 0
        assert defaults.confidence_steepness == 8.0

    def test_should_accept_custom_keywords(self) -> None:
        """A caller-defined type wins when only its keywords appear."""
        only_custom = ScoringConfig(
            keywords={"custom_type": ["alpha", "beta"]},
            weights={"custom_type": 1.0},
        )
        outcome = score_task("alpha beta gamma", config=only_custom)
        assert outcome.top_type == "custom_type"

    def test_should_accept_custom_boundaries(self) -> None:
        """Very low tier boundaries push an ordinary task into HIGH."""
        tight = ScoringConfig(tier_boundaries=(0.1, 0.2))
        outcome = score_task("fix the bug", config=tight)
        # Boundaries this low leave almost every score above the top one.
        assert outcome.tier == Tier.HIGH

    def test_should_accept_custom_steepness(self) -> None:
        """A steeper curve never yields lower confidence for this text."""
        text = "fix the bug crash"
        gentle_result = score_task(
            text, config=ScoringConfig(confidence_steepness=1.0),
        )
        sharp_result = score_task(
            text, config=ScoringConfig(confidence_steepness=50.0),
        )
        # The steeper setting should be at least as confident.
        assert sharp_result.confidence >= gentle_result.confidence

    def test_should_be_frozen(self) -> None:
        """Assigning to a config attribute raises AttributeError."""
        locked = ScoringConfig()
        with pytest.raises(AttributeError):
            locked.confidence_steepness = 99.0
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
# ============================================================================
|
|
306
|
+
# Tier enum
|
|
307
|
+
# ============================================================================
|
|
308
|
+
|
|
309
|
+
class TestTier:
    """Checks the members and string values of the ``Tier`` enum."""

    def test_should_have_three_values(self) -> None:
        """The enum defines exactly three tiers."""
        member_count = len(Tier)
        assert member_count == 3

    def test_should_have_string_values(self) -> None:
        """Each tier's value is its lowercase name."""
        expected_values = (
            (Tier.LOW, "low"),
            (Tier.MEDIUM, "medium"),
            (Tier.HIGH, "high"),
        )
        for member, text in expected_values:
            assert member.value == text

    def test_should_be_constructible_from_string(self) -> None:
        """Looking a tier up by its string value returns the member."""
        low = Tier("low")
        high = Tier("high")
        assert low == Tier.LOW
        assert high == Tier.HIGH
|