code-maat-python 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_maat_python/__init__.py +12 -0
- code_maat_python/__main__.py +5 -0
- code_maat_python/analyses/__init__.py +39 -0
- code_maat_python/analyses/age.py +101 -0
- code_maat_python/analyses/authors.py +60 -0
- code_maat_python/analyses/churn.py +353 -0
- code_maat_python/analyses/communication.py +151 -0
- code_maat_python/analyses/coupling.py +136 -0
- code_maat_python/analyses/effort.py +210 -0
- code_maat_python/analyses/entities.py +51 -0
- code_maat_python/analyses/revisions.py +56 -0
- code_maat_python/analyses/soc.py +90 -0
- code_maat_python/analyses/summary.py +61 -0
- code_maat_python/cli.py +822 -0
- code_maat_python/output/__init__.py +0 -0
- code_maat_python/parser.py +232 -0
- code_maat_python/pipeline.py +112 -0
- code_maat_python/transformers/__init__.py +0 -0
- code_maat_python/transformers/grouper.py +204 -0
- code_maat_python/transformers/team_mapper.py +132 -0
- code_maat_python/transformers/time_grouper.py +146 -0
- code_maat_python/utils/__init__.py +0 -0
- code_maat_python/utils/math.py +105 -0
- code_maat_python-0.1.0.dist-info/METADATA +545 -0
- code_maat_python-0.1.0.dist-info/RECORD +28 -0
- code_maat_python-0.1.0.dist-info/WHEEL +4 -0
- code_maat_python-0.1.0.dist-info/entry_points.txt +3 -0
- code_maat_python-0.1.0.dist-info/licenses/LICENSE +674 -0
|
File without changes
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
"""Git log parser for code-maat-python.
|
|
2
|
+
|
|
3
|
+
Parses git log output into pandas DataFrames for analysis.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Iterator, TextIO
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
import re
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
|
|
15
|
+
class CommitRecord:
|
|
16
|
+
"""Represents a single commit with file changes."""
|
|
17
|
+
|
|
18
|
+
rev: str
|
|
19
|
+
date: str
|
|
20
|
+
author: str
|
|
21
|
+
entity: str
|
|
22
|
+
loc_added: int
|
|
23
|
+
loc_deleted: int
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class GitLogSchema:
    """Schema definition for parsed git-log DataFrames."""

    # Canonical column names
    REV = "rev"
    DATE = "date"
    AUTHOR = "author"
    ENTITY = "entity"
    LOC_ADDED = "loc_added"
    LOC_DELETED = "loc_deleted"

    # Preferred dtypes; categoricals and narrow ints keep memory usage low
    DTYPES = {
        REV: "category",
        AUTHOR: "category",
        ENTITY: "category",
        LOC_ADDED: "int32",
        LOC_DELETED: "int32",
    }

    @classmethod
    def columns(cls) -> list[str]:
        """Return the column names in canonical order."""
        return [
            cls.REV,
            cls.DATE,
            cls.AUTHOR,
            cls.ENTITY,
            cls.LOC_ADDED,
            cls.LOC_DELETED,
        ]

    @classmethod
    def create_empty_dataframe(cls) -> pd.DataFrame:
        """Build an empty DataFrame carrying the canonical columns.

        The date column is coerced to datetime so comparisons against it
        behave the same as for non-empty frames.
        """
        empty = pd.DataFrame(columns=cls.columns())
        empty[cls.DATE] = pd.to_datetime(empty[cls.DATE])
        return empty
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class GitLogParser:
    """Parser for git log output.

    Parses output from:
        git log --all -M -C --numstat --date=short --pretty=format:'--%h--%cd--%cn'
    """

    # Commit header: --<short hash>--<YYYY-MM-DD>--<author name>
    COMMIT_PATTERN = re.compile(r"^--([a-f0-9]+)--(\d{4}-\d{2}-\d{2})--(.*)$")
    # Numstat line: <added>\t<deleted>\t<path>; "-" marks binary files
    CHANGE_PATTERN = re.compile(r"^(\d+|-)\t(\d+|-)\t(.+)$")

    def __init__(self) -> None:
        """Initialize the parser."""
        # Header fields of the commit whose changes are being read.
        self._current_commit: dict[str, str] | None = None
        # Accumulated (commit, file-change) records.
        self._records: list[CommitRecord] = []

    def parse_file(self, filepath: Path | str) -> pd.DataFrame:
        """Parse a git log file.

        Args:
            filepath: Path to the git log file

        Returns:
            DataFrame with columns: rev, date, author, entity, loc_added, loc_deleted

        Raises:
            FileNotFoundError: If the file doesn't exist
            ValueError: If the file format is invalid
        """
        filepath = Path(filepath)
        if not filepath.exists():
            raise FileNotFoundError(f"Git log file not found: {filepath}")

        with open(filepath, "r", encoding="utf-8") as f:
            return self.parse_stream(f)

    def parse_stream(self, stream: TextIO) -> pd.DataFrame:
        """Parse a git log from a stream.

        Args:
            stream: Text stream (file object) containing git log

        Returns:
            DataFrame with parsed commit data
        """
        self._current_commit = None
        self._records = []

        for line_num, line in enumerate(stream, start=1):
            line = line.strip()

            # Skip empty lines
            if not line:
                continue

            # A commit header opens a new commit context
            if self._try_parse_commit(line):
                continue

            # A numstat line records a file change for the current commit
            if self._try_parse_change(line, line_num):
                continue

            # Non-empty line matching neither pattern: warn rather than
            # drop it silently.  (The previous guard on this branch was
            # always true because `line` is already stripped above.)
            print(f"Warning: Unparsed line {line_num}: {line[:50]}")

        return self._to_dataframe()

    def _try_parse_commit(self, line: str) -> bool:
        """Try to parse line as commit header.

        Args:
            line: Line from git log

        Returns:
            True if line was a commit header, False otherwise
        """
        match = self.COMMIT_PATTERN.match(line)
        if match:
            self._current_commit = {
                "rev": match.group(1),
                "date": match.group(2),
                "author": match.group(3).strip(),
            }
            return True
        return False

    def _try_parse_change(self, line: str, line_num: int) -> bool:
        """Try to parse line as file change.

        Args:
            line: Line from git log
            line_num: Line number (for error reporting)

        Returns:
            True if line was a file change, False otherwise
        """
        match = self.CHANGE_PATTERN.match(line)
        if match and self._current_commit:
            # Binary files report "-" instead of counts; treat as 0.
            loc_added = 0 if match.group(1) == "-" else int(match.group(1))
            loc_deleted = 0 if match.group(2) == "-" else int(match.group(2))
            entity = match.group(3)

            record = CommitRecord(
                rev=self._current_commit["rev"],
                date=self._current_commit["date"],
                author=self._current_commit["author"],
                entity=entity,
                loc_added=loc_added,
                loc_deleted=loc_deleted,
            )
            self._records.append(record)
            return True

        if match and not self._current_commit:
            # A change line before any header means the log is malformed.
            print(f"Warning: File change without commit header at line {line_num}")
            return False

        return False

    def _to_dataframe(self) -> pd.DataFrame:
        """Convert parsed records to DataFrame.

        Returns:
            DataFrame with proper schema and types
        """
        if not self._records:
            return GitLogSchema.create_empty_dataframe()

        # Convert records to columnar dict
        data = {
            GitLogSchema.REV: [r.rev for r in self._records],
            GitLogSchema.DATE: [r.date for r in self._records],
            GitLogSchema.AUTHOR: [r.author for r in self._records],
            GitLogSchema.ENTITY: [r.entity for r in self._records],
            GitLogSchema.LOC_ADDED: [r.loc_added for r in self._records],
            GitLogSchema.LOC_DELETED: [r.loc_deleted for r in self._records],
        }

        df = pd.DataFrame(data)

        # Convert date strings to datetime
        df[GitLogSchema.DATE] = pd.to_datetime(df[GitLogSchema.DATE])

        # Apply the declared schema dtypes (categoricals + int32 counters)
        # in one place so GitLogSchema.DTYPES stays the single source of
        # truth; previously the int32 entries were never applied.
        return df.astype(GitLogSchema.DTYPES)
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
# Convenience function
|
|
216
|
+
def parse_git_log(filepath: Path | str) -> pd.DataFrame:
    """Parse a git log file into a DataFrame.

    Thin convenience wrapper around ``GitLogParser`` for one-shot use.

    Args:
        filepath: Path to git log file

    Returns:
        DataFrame with parsed commit data

    Example:
        >>> df = parse_git_log("git.log")
        >>> print(df.head())
    """
    return GitLogParser().parse_file(filepath)
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""Data pipeline utilities for code-maat-python.
|
|
2
|
+
|
|
3
|
+
Provides utilities for transforming and filtering DataFrames between
|
|
4
|
+
parsing and analysis stages.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Callable
|
|
10
|
+
|
|
11
|
+
from code_maat_python.parser import parse_git_log, GitLogSchema
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def load_git_log(filepath: Path | str) -> pd.DataFrame:
    """Load and parse a git log file.

    Pipeline-facing alias for the parser's convenience function.

    Args:
        filepath: Path to git log file

    Returns:
        Parsed DataFrame

    Raises:
        FileNotFoundError: If file doesn't exist
    """
    return parse_git_log(filepath)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def filter_by_date(
    df: pd.DataFrame, start_date: str | None = None, end_date: str | None = None
) -> pd.DataFrame:
    """Restrict rows to an inclusive date window.

    Args:
        df: Input DataFrame
        start_date: Inclusive lower bound (YYYY-MM-DD), or None for no limit
        end_date: Inclusive upper bound (YYYY-MM-DD), or None for no limit

    Returns:
        Filtered DataFrame

    Example:
        >>> df_filtered = filter_by_date(df, start_date="2023-01-01")
    """
    filtered = df.copy()
    date_col = GitLogSchema.DATE

    if start_date:
        filtered = filtered[filtered[date_col] >= pd.to_datetime(start_date)]

    if end_date:
        filtered = filtered[filtered[date_col] <= pd.to_datetime(end_date)]

    return filtered
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def filter_by_entity_pattern(df: pd.DataFrame, pattern: str) -> pd.DataFrame:
    r"""Keep only rows whose entity path contains the given regex.

    Args:
        df: Input DataFrame
        pattern: Regex pattern searched within entity paths

    Returns:
        Filtered DataFrame

    Example:
        >>> df_py = filter_by_entity_pattern(df, r".*\.py$")
    """
    # na=False treats missing entity values as non-matching.
    matches = df[GitLogSchema.ENTITY].str.contains(pattern, na=False, regex=True)
    return df[matches].copy()
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def apply_transformation(
    df: pd.DataFrame, transform_fn: Callable[[pd.DataFrame], pd.DataFrame]
) -> pd.DataFrame:
    """Run one transformation step of the pipeline.

    Generic hook for composing DataFrame-to-DataFrame stages.

    Args:
        df: Input DataFrame
        transform_fn: Function mapping a DataFrame to a DataFrame

    Returns:
        Transformed DataFrame
    """
    transformed = transform_fn(df)
    return transformed
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def validate_dataframe(df: pd.DataFrame) -> None:
    """Check that a DataFrame conforms to the git-log schema.

    Args:
        df: DataFrame to validate

    Raises:
        ValueError: If required columns are missing or the date column
            is not a datetime dtype
    """
    missing = set(GitLogSchema.columns()) - set(df.columns)
    if missing:
        raise ValueError(f"DataFrame missing required columns: {missing}")

    # The date column must already be parsed to datetime for comparisons.
    if not pd.api.types.is_datetime64_any_dtype(df[GitLogSchema.DATE]):
        raise ValueError(f"Column '{GitLogSchema.DATE}' must be datetime type")
|
|
File without changes
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
r"""Architectural grouper for mapping entities to logical boundaries.
|
|
2
|
+
|
|
3
|
+
This module provides functionality to map physical file paths to logical
|
|
4
|
+
architectural groups (layers, components, subsystems) before analysis.
|
|
5
|
+
|
|
6
|
+
Grouping is specified via text files with two pattern types:
|
|
7
|
+
1. Plain text paths: "src/Features/Core => Core"
|
|
8
|
+
- Automatically converted to regex: ^src/Features/Core/
|
|
9
|
+
2. Explicit regex: r"^src\/.*Test.*$ => Tests"
|
|
10
|
+
- Used as-is for flexible matching
|
|
11
|
+
|
|
12
|
+
First matching pattern wins (order matters). Unmapped entities are filtered out.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import re
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from re import Pattern
|
|
19
|
+
|
|
20
|
+
import pandas as pd
|
|
21
|
+
|
|
22
|
+
from code_maat_python.parser import GitLogSchema
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
|
|
26
|
+
class GroupPattern:
|
|
27
|
+
"""A compiled grouping pattern.
|
|
28
|
+
|
|
29
|
+
Attributes:
|
|
30
|
+
pattern: Compiled regex pattern to match entity paths
|
|
31
|
+
logical_name: Logical group name to map to
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
pattern: Pattern[str]
|
|
35
|
+
logical_name: str
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def parse_group_specification(spec: str) -> list[GroupPattern]:
|
|
39
|
+
"""Parse a grouping specification into compiled patterns.
|
|
40
|
+
|
|
41
|
+
The specification format is:
|
|
42
|
+
path_pattern => logical_name
|
|
43
|
+
|
|
44
|
+
Where path_pattern can be:
|
|
45
|
+
- Plain text path (e.g., "src/Core") - auto-converted to ^src/Core/
|
|
46
|
+
- Explicit regex (e.g., r"^src\\/.*Test.*$") - used as-is
|
|
47
|
+
|
|
48
|
+
Args:
|
|
49
|
+
spec: Multi-line string with grouping specifications
|
|
50
|
+
|
|
51
|
+
Returns:
|
|
52
|
+
List of compiled GroupPattern objects in specification order
|
|
53
|
+
|
|
54
|
+
Raises:
|
|
55
|
+
ValueError: If specification format is invalid or regex won't compile
|
|
56
|
+
|
|
57
|
+
Examples:
|
|
58
|
+
>>> spec = "src/Core => Core\\n^src\\\\/.*Test.*$ => Tests"
|
|
59
|
+
>>> patterns = parse_group_specification(spec)
|
|
60
|
+
>>> len(patterns)
|
|
61
|
+
2
|
|
62
|
+
"""
|
|
63
|
+
patterns: list[GroupPattern] = []
|
|
64
|
+
|
|
65
|
+
for line_num, line in enumerate(spec.split("\n"), start=1):
|
|
66
|
+
# Skip empty lines and whitespace
|
|
67
|
+
line = line.strip()
|
|
68
|
+
if not line:
|
|
69
|
+
continue
|
|
70
|
+
|
|
71
|
+
# Check for separator
|
|
72
|
+
if "=>" not in line:
|
|
73
|
+
raise ValueError(
|
|
74
|
+
f"Invalid group specification at line {line_num}: "
|
|
75
|
+
f"Missing '=>' separator. Line: '{line}'"
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
# Split on separator
|
|
79
|
+
parts = line.split("=>", maxsplit=1)
|
|
80
|
+
if len(parts) != 2:
|
|
81
|
+
raise ValueError(
|
|
82
|
+
f"Invalid group specification at line {line_num}: "
|
|
83
|
+
f"Expected 'pattern => name' format. Line: '{line}'"
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
path_pattern = parts[0].strip()
|
|
87
|
+
logical_name = parts[1].strip()
|
|
88
|
+
|
|
89
|
+
if not path_pattern or not logical_name:
|
|
90
|
+
raise ValueError(
|
|
91
|
+
f"Invalid group specification at line {line_num}: "
|
|
92
|
+
f"Pattern and name cannot be empty. Line: '{line}'"
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
# Determine pattern type and compile
|
|
96
|
+
try:
|
|
97
|
+
if path_pattern.startswith("^"):
|
|
98
|
+
# Explicit regex pattern - use as-is
|
|
99
|
+
compiled_pattern = re.compile(path_pattern)
|
|
100
|
+
else:
|
|
101
|
+
# Plain text path - convert to regex with trailing /
|
|
102
|
+
# Escape special regex characters, then add ^ and /
|
|
103
|
+
escaped = re.escape(path_pattern)
|
|
104
|
+
regex_str = f"^{escaped}/"
|
|
105
|
+
compiled_pattern = re.compile(regex_str)
|
|
106
|
+
except re.error as e:
|
|
107
|
+
raise ValueError(
|
|
108
|
+
f"Invalid regex pattern at line {line_num}: {path_pattern}. " f"Error: {str(e)}"
|
|
109
|
+
) from e
|
|
110
|
+
|
|
111
|
+
patterns.append(GroupPattern(pattern=compiled_pattern, logical_name=logical_name))
|
|
112
|
+
|
|
113
|
+
return patterns
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def map_entities_to_groups(df: pd.DataFrame, patterns: list[GroupPattern]) -> pd.DataFrame:
    """Replace entity paths with their logical group names.

    Patterns are tried in specification order and the first match wins.
    Rows whose entity matches no pattern are dropped (not passed through).

    Args:
        df: DataFrame with GitLogSchema columns
        patterns: List of compiled GroupPattern objects

    Returns:
        DataFrame with entity names replaced by logical group names.
        Only includes rows where entity matched a pattern.

    Examples:
        >>> data = [{"entity": "src/Core/file.py", "author": "Alice", ...}]
        >>> df = pd.DataFrame(data)
        >>> spec = "src/Core => Core"
        >>> patterns = parse_group_specification(spec)
        >>> result = map_entities_to_groups(df, patterns)
        >>> result.iloc[0]["entity"]
        'Core'
    """
    # Empty input: return an empty frame with the canonical schema.
    if df.empty:
        return GitLogSchema.create_empty_dataframe()

    # No patterns means nothing can match, so everything is filtered out.
    if not patterns:
        return df.iloc[:0].copy()

    def resolve(entity: str) -> str | None:
        """Return the first matching logical group name, or None."""
        for candidate in patterns:
            if candidate.pattern.match(entity):
                return candidate.logical_name
        return None

    entity_col = GitLogSchema.ENTITY
    working = df.copy()
    working["_logical_group"] = working[entity_col].apply(resolve)

    # Keep only entities that resolved to a group.
    mapped = working[working["_logical_group"].notna()].copy()

    # Substitute the logical name for the physical path.
    if not mapped.empty:
        mapped[entity_col] = mapped["_logical_group"]

    return mapped.drop(columns=["_logical_group"])
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def load_group_specification_file(filepath: Path | str) -> list[GroupPattern]:
    """Read a grouping specification file and compile its patterns.

    Convenience wrapper that loads a file and delegates to
    ``parse_group_specification``.

    Args:
        filepath: Path to grouping specification file

    Returns:
        List of compiled GroupPattern objects

    Raises:
        FileNotFoundError: If file doesn't exist
        ValueError: If file contents are invalid

    Example:
        >>> patterns = load_group_specification_file("layers.txt")
        >>> len(patterns) > 0
        True
    """
    filepath = Path(filepath)

    if not filepath.exists():
        raise FileNotFoundError(f"Group specification file not found: {filepath}")
    if not filepath.is_file():
        raise ValueError(f"Not a file: {filepath}")

    return parse_group_specification(filepath.read_text(encoding="utf-8"))
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""Team mapper for aggregating author contributions to team level.
|
|
2
|
+
|
|
3
|
+
This module provides functionality to map individual authors to teams, allowing
|
|
4
|
+
analysis at the organizational team level rather than individual contributor level.
|
|
5
|
+
|
|
6
|
+
Team mappings are specified via CSV files with format:
|
|
7
|
+
author,team
|
|
8
|
+
John Doe,Backend Team
|
|
9
|
+
Jane Smith,Backend Team
|
|
10
|
+
|
|
11
|
+
Authors not in the mapping are preserved as-is (treated as individual "teams"),
|
|
12
|
+
which allows detection of missing mappings rather than silent data loss.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from io import StringIO
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
import pandas as pd
|
|
19
|
+
|
|
20
|
+
from code_maat_python.parser import GitLogSchema
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def parse_team_mapping_csv(csv_content: str) -> dict[str, str]:
|
|
24
|
+
"""Parse a team mapping CSV into author->team dictionary.
|
|
25
|
+
|
|
26
|
+
The CSV format expected:
|
|
27
|
+
author,team
|
|
28
|
+
John Doe,Backend Team
|
|
29
|
+
Jane Smith,Frontend Team
|
|
30
|
+
|
|
31
|
+
Args:
|
|
32
|
+
csv_content: CSV string with author,team columns
|
|
33
|
+
|
|
34
|
+
Returns:
|
|
35
|
+
Dictionary mapping author names to team names
|
|
36
|
+
|
|
37
|
+
Raises:
|
|
38
|
+
ValueError: If CSV is missing required columns
|
|
39
|
+
|
|
40
|
+
Examples:
|
|
41
|
+
>>> csv = "author,team\\nAlice,Backend\\nBob,Frontend"
|
|
42
|
+
>>> mapping = parse_team_mapping_csv(csv)
|
|
43
|
+
>>> mapping["Alice"]
|
|
44
|
+
'Backend'
|
|
45
|
+
"""
|
|
46
|
+
# Parse CSV (keep_default_na=False to preserve empty strings)
|
|
47
|
+
df = pd.read_csv(StringIO(csv_content), keep_default_na=False)
|
|
48
|
+
|
|
49
|
+
# Validate required columns
|
|
50
|
+
required_cols = {"author", "team"}
|
|
51
|
+
if not required_cols.issubset(set(df.columns)):
|
|
52
|
+
missing = required_cols - set(df.columns)
|
|
53
|
+
raise ValueError(
|
|
54
|
+
f"Missing required columns in team mapping CSV: {missing}. "
|
|
55
|
+
f"Expected columns: {required_cols}, got: {set(df.columns)}"
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
# Build author->team lookup dictionary
|
|
59
|
+
# If there are duplicates, last one wins (pandas default behavior)
|
|
60
|
+
# strict=False allows different lengths (though they should match in valid CSV)
|
|
61
|
+
mapping = dict(zip(df["author"], df["team"], strict=False))
|
|
62
|
+
|
|
63
|
+
return mapping
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def map_authors_to_teams(df: pd.DataFrame, team_mapping: dict[str, str]) -> pd.DataFrame:
    """Replace author names with team names where a mapping exists.

    Authors without a mapping entry pass through unchanged, which makes
    missing mappings visible in the output instead of silently losing data.

    Args:
        df: DataFrame with GitLogSchema columns
        team_mapping: Dictionary mapping author names to team names

    Returns:
        DataFrame with author column replaced by team names.
        Authors not in mapping are kept unchanged.

    Examples:
        >>> data = [{"entity": "file.py", "author": "Alice", ...}]
        >>> df = pd.DataFrame(data)
        >>> mapping = {"Alice": "Backend Team"}
        >>> result = map_authors_to_teams(df, mapping)
        >>> result.iloc[0]["author"]
        'Backend Team'
    """
    # Empty input: return an empty frame with the canonical schema.
    if df.empty:
        return GitLogSchema.create_empty_dataframe()

    # Work on a copy so the caller's frame is untouched.
    mapped = df.copy()
    column = GitLogSchema.AUTHOR

    # dict.get(name, name) falls back to the original author name.
    mapped[column] = mapped[column].map(lambda name: team_mapping.get(name, name))

    return mapped
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def load_team_mapping_file(filepath: Path | str) -> dict[str, str]:
    """Read a team mapping CSV file and return its lookup dictionary.

    Convenience wrapper that loads a file and delegates to
    ``parse_team_mapping_csv``.

    Args:
        filepath: Path to team mapping CSV file

    Returns:
        Dictionary mapping author names to team names

    Raises:
        FileNotFoundError: If file doesn't exist
        ValueError: If file is invalid or missing required columns

    Example:
        >>> mapping = load_team_mapping_file("teams.csv")
        >>> "Alice" in mapping
        True
    """
    filepath = Path(filepath)

    if not filepath.exists():
        raise FileNotFoundError(f"Team mapping file not found: {filepath}")
    if not filepath.is_file():
        raise ValueError(f"Not a file: {filepath}")

    return parse_team_mapping_csv(filepath.read_text(encoding="utf-8"))
|