code-maat-python 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,232 @@
1
+ """Git log parser for code-maat-python.
2
+
3
+ Parses git log output into pandas DataFrames for analysis.
4
+ """
5
+
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Iterator, TextIO
9
+
10
+ import pandas as pd
11
+ import re
12
+
13
+
14
@dataclass
class CommitRecord:
    """Represents a single commit with file changes.

    One record corresponds to one (commit, file) pair parsed from
    ``git log --numstat`` output; a commit touching N files produces
    N records sharing the same rev/date/author.
    """

    # Abbreviated commit hash (the %h placeholder in the log format).
    rev: str
    # Commit date string in YYYY-MM-DD form (from --date=short).
    date: str
    # NOTE: the log format uses %cn (committer name), stored here as
    # "author"; the value is whitespace-stripped by the parser.
    author: str
    # Path of the changed file as reported by --numstat.
    entity: str
    # Lines added; 0 when numstat reports "-" (binary file).
    loc_added: int
    # Lines deleted; 0 when numstat reports "-" (binary file).
    loc_deleted: int
24
+
25
+
26
class GitLogSchema:
    """Defines the DataFrame schema for parsed git logs."""

    # Column names
    REV = "rev"
    DATE = "date"
    AUTHOR = "author"
    ENTITY = "entity"
    LOC_ADDED = "loc_added"
    LOC_DELETED = "loc_deleted"

    # Column dtypes applied for memory efficiency.  DATE is handled
    # separately (datetime64) and is deliberately not listed here.
    DTYPES = {
        REV: "category",
        AUTHOR: "category",
        ENTITY: "category",
        LOC_ADDED: "int32",
        LOC_DELETED: "int32",
    }

    @classmethod
    def columns(cls) -> list[str]:
        """Return ordered list of column names."""
        return [cls.REV, cls.DATE, cls.AUTHOR, cls.ENTITY, cls.LOC_ADDED, cls.LOC_DELETED]

    @classmethod
    def create_empty_dataframe(cls) -> pd.DataFrame:
        """Create an empty DataFrame with the correct schema.

        Bug fix: the empty frame previously kept plain ``object`` columns,
        so its dtypes disagreed with ``DTYPES`` as applied to non-empty
        parses.  Applying ``astype(cls.DTYPES)`` keeps empty and non-empty
        results schema-consistent.
        """
        df = pd.DataFrame(columns=cls.columns())
        # Date column becomes datetime64, matching the non-empty parse path.
        df[cls.DATE] = pd.to_datetime(df[cls.DATE])
        return df.astype(cls.DTYPES)
57
+
58
+
59
class GitLogParser:
    """Parser for git log output.

    Parses output from:
        git log --all -M -C --numstat --date=short --pretty=format:'--%h--%cd--%cn'
    """

    # Commit header line: --<abbrev hash>--<YYYY-MM-DD>--<committer name>
    COMMIT_PATTERN = re.compile(r"^--([a-f0-9]+)--(\d{4}-\d{2}-\d{2})--(.*)$")
    # Numstat line: <added>\t<deleted>\t<path>; "-" counts mark binary files.
    CHANGE_PATTERN = re.compile(r"^(\d+|-)\t(\d+|-)\t(.+)$")

    def __init__(self) -> None:
        """Initialize the parser."""
        # Header fields (rev/date/author) of the commit currently being read.
        self._current_commit: dict[str, str] | None = None
        # Accumulated (commit, file) records for the current parse run.
        self._records: list[CommitRecord] = []

    def parse_file(self, filepath: Path | str) -> pd.DataFrame:
        """Parse a git log file.

        Args:
            filepath: Path to the git log file

        Returns:
            DataFrame with columns: rev, date, author, entity, loc_added, loc_deleted

        Raises:
            FileNotFoundError: If the file doesn't exist
            ValueError: If the file format is invalid
        """
        filepath = Path(filepath)
        if not filepath.exists():
            raise FileNotFoundError(f"Git log file not found: {filepath}")

        with open(filepath, "r", encoding="utf-8") as f:
            return self.parse_stream(f)

    def parse_stream(self, stream: TextIO) -> pd.DataFrame:
        """Parse a git log from a stream.

        Args:
            stream: Text stream (file object) containing git log

        Returns:
            DataFrame with parsed commit data
        """
        # Reset state so a parser instance can be reused across calls.
        self._current_commit = None
        self._records = []

        for line_num, line in enumerate(stream, start=1):
            line = line.strip()

            # Skip empty lines (commit separators in this log format).
            if not line:
                continue

            # Commit header first, then numstat change line.
            if self._try_parse_commit(line):
                continue
            if self._try_parse_change(line, line_num):
                continue

            # Neither pattern matched.  The line is non-empty after strip(),
            # so warn unconditionally; the previous guard
            # (`line.strip() and not line.startswith(" ")`) was dead code
            # because `line` had already been stripped above.
            print(f"Warning: Unparsed line {line_num}: {line[:50]}")

        return self._to_dataframe()

    def _try_parse_commit(self, line: str) -> bool:
        """Try to parse line as commit header.

        Args:
            line: Line from git log

        Returns:
            True if line was a commit header, False otherwise
        """
        match = self.COMMIT_PATTERN.match(line)
        if not match:
            return False
        self._current_commit = {
            "rev": match.group(1),
            "date": match.group(2),
            "author": match.group(3).strip(),
        }
        return True

    def _try_parse_change(self, line: str, line_num: int) -> bool:
        """Try to parse line as file change.

        Args:
            line: Line from git log
            line_num: Line number (for error reporting)

        Returns:
            True if line was a file change, False otherwise
        """
        match = self.CHANGE_PATTERN.match(line)
        if not match:
            return False

        if not self._current_commit:
            # Change line before any commit header: report it and return
            # False so the caller's unparsed-line warning also fires.
            print(f"Warning: File change without commit header at line {line_num}")
            return False

        # "-" in either count marks a binary file; treat as zero lines.
        loc_added = 0 if match.group(1) == "-" else int(match.group(1))
        loc_deleted = 0 if match.group(2) == "-" else int(match.group(2))

        self._records.append(
            CommitRecord(
                rev=self._current_commit["rev"],
                date=self._current_commit["date"],
                author=self._current_commit["author"],
                entity=match.group(3),
                loc_added=loc_added,
                loc_deleted=loc_deleted,
            )
        )
        return True

    def _to_dataframe(self) -> pd.DataFrame:
        """Convert parsed records to DataFrame.

        Returns:
            DataFrame with proper schema and types
        """
        if not self._records:
            return GitLogSchema.create_empty_dataframe()

        # Column-wise construction avoids a per-record dict conversion.
        data = {
            GitLogSchema.REV: [r.rev for r in self._records],
            GitLogSchema.DATE: [r.date for r in self._records],
            GitLogSchema.AUTHOR: [r.author for r in self._records],
            GitLogSchema.ENTITY: [r.entity for r in self._records],
            GitLogSchema.LOC_ADDED: [r.loc_added for r in self._records],
            GitLogSchema.LOC_DELETED: [r.loc_deleted for r in self._records],
        }

        df = pd.DataFrame(data)

        # Convert date to datetime
        df[GitLogSchema.DATE] = pd.to_datetime(df[GitLogSchema.DATE])

        # Apply categorical types for memory efficiency
        df[GitLogSchema.REV] = df[GitLogSchema.REV].astype("category")
        df[GitLogSchema.AUTHOR] = df[GitLogSchema.AUTHOR].astype("category")
        df[GitLogSchema.ENTITY] = df[GitLogSchema.ENTITY].astype("category")

        return df
213
+
214
+
215
# Convenience function
def parse_git_log(filepath: Path | str) -> pd.DataFrame:
    """Parse a git log file into a DataFrame.

    Thin convenience wrapper: builds a throwaway ``GitLogParser`` and
    delegates to its ``parse_file`` method.

    Args:
        filepath: Path to git log file

    Returns:
        DataFrame with parsed commit data

    Example:
        >>> df = parse_git_log("git.log")
        >>> print(df.head())
    """
    return GitLogParser().parse_file(filepath)
@@ -0,0 +1,112 @@
1
+ """Data pipeline utilities for code-maat-python.
2
+
3
+ Provides utilities for transforming and filtering DataFrames between
4
+ parsing and analysis stages.
5
+ """
6
+
7
+ import pandas as pd
8
+ from pathlib import Path
9
+ from typing import Callable
10
+
11
+ from code_maat_python.parser import parse_git_log, GitLogSchema
12
+
13
+
14
def load_git_log(filepath: Path | str) -> pd.DataFrame:
    """Load and parse a git log file.

    Thin pipeline-facing wrapper around the parser module.

    Args:
        filepath: Path to git log file

    Returns:
        Parsed DataFrame

    Raises:
        FileNotFoundError: If file doesn't exist
    """
    log_path = Path(filepath)
    return parse_git_log(log_path)
29
+
30
+
31
def filter_by_date(
    df: pd.DataFrame, start_date: str | None = None, end_date: str | None = None
) -> pd.DataFrame:
    """Restrict a DataFrame to commits within a date range.

    Each bound is optional; a falsy value (None or "") leaves that side
    of the range open.

    Args:
        df: Input DataFrame
        start_date: Start date (YYYY-MM-DD) or None for no start limit
        end_date: End date (YYYY-MM-DD) or None for no end limit

    Returns:
        Filtered DataFrame (the input is never modified)

    Example:
        >>> df_filtered = filter_by_date(df, start_date="2023-01-01")
    """
    filtered = df.copy()

    if start_date:
        filtered = filtered[filtered[GitLogSchema.DATE] >= pd.to_datetime(start_date)]
    if end_date:
        filtered = filtered[filtered[GitLogSchema.DATE] <= pd.to_datetime(end_date)]

    return filtered
58
+
59
+
60
def filter_by_entity_pattern(df: pd.DataFrame, pattern: str) -> pd.DataFrame:
    """Keep only rows whose entity path matches a regex pattern.

    Args:
        df: Input DataFrame
        pattern: Regex pattern to match against entity paths

    Returns:
        Filtered copy of the DataFrame

    Example:
        >>> df_py = filter_by_entity_pattern(df, r".*\\.py$")
    """
    entities = df[GitLogSchema.ENTITY]
    # na=False treats missing entities as non-matching rather than NaN.
    matches = entities.str.contains(pattern, na=False, regex=True)
    return df.loc[matches].copy()
75
+
76
+
77
def apply_transformation(
    df: pd.DataFrame, transform_fn: Callable[[pd.DataFrame], pd.DataFrame]
) -> pd.DataFrame:
    """Apply a transformation function to a DataFrame.

    Generic pipeline utility: whatever ``transform_fn`` returns is
    passed through unchanged.

    Args:
        df: Input DataFrame
        transform_fn: Function that takes and returns a DataFrame

    Returns:
        Transformed DataFrame
    """
    transformed = transform_fn(df)
    return transformed
92
+
93
+
94
def validate_dataframe(df: pd.DataFrame) -> None:
    """Validate that a DataFrame has the required columns and types.

    Checks for the full GitLogSchema column set and a datetime date
    column; extra columns are tolerated.

    Args:
        df: DataFrame to validate

    Raises:
        ValueError: If DataFrame is invalid
    """
    missing = set(GitLogSchema.columns()) - set(df.columns)
    if missing:
        raise ValueError(f"DataFrame missing required columns: {missing}")

    # The date column must already be parsed into datetime64.
    if not pd.api.types.is_datetime64_any_dtype(df[GitLogSchema.DATE]):
        raise ValueError(f"Column '{GitLogSchema.DATE}' must be datetime type")
File without changes
@@ -0,0 +1,204 @@
1
+ r"""Architectural grouper for mapping entities to logical boundaries.
2
+
3
+ This module provides functionality to map physical file paths to logical
4
+ architectural groups (layers, components, subsystems) before analysis.
5
+
6
+ Grouping is specified via text files with two pattern types:
7
+ 1. Plain text paths: "src/Features/Core => Core"
8
+ - Automatically converted to regex: ^src/Features/Core/
9
+ 2. Explicit regex: r"^src\/.*Test.*$ => Tests"
10
+ - Used as-is for flexible matching
11
+
12
+ First matching pattern wins (order matters). Unmapped entities are filtered out.
13
+ """
14
+
15
+ import re
16
+ from dataclasses import dataclass
17
+ from pathlib import Path
18
+ from re import Pattern
19
+
20
+ import pandas as pd
21
+
22
+ from code_maat_python.parser import GitLogSchema
23
+
24
+
25
@dataclass
class GroupPattern:
    """A compiled grouping pattern.

    Attributes:
        pattern: Compiled regex pattern to match entity paths
        logical_name: Logical group name to map to
    """

    pattern: Pattern[str]
    logical_name: str


def parse_group_specification(spec: str) -> list[GroupPattern]:
    """Parse a grouping specification into compiled patterns.

    The specification format is one mapping per line:
        path_pattern => logical_name

    Where path_pattern can be:
        - Plain text path (e.g., "src/Core") - auto-converted to ^src/Core/
        - Explicit regex, recognized by a leading "^" - used as-is

    Args:
        spec: Multi-line string with grouping specifications

    Returns:
        List of compiled GroupPattern objects in specification order

    Raises:
        ValueError: If specification format is invalid or regex won't compile
    """
    patterns: list[GroupPattern] = []

    for line_num, line in enumerate(spec.split("\n"), start=1):
        # Blank lines are allowed and ignored.
        line = line.strip()
        if not line:
            continue

        if "=>" not in line:
            raise ValueError(
                f"Invalid group specification at line {line_num}: "
                f"Missing '=>' separator. Line: '{line}'"
            )

        # "=>" is guaranteed present here, so split(maxsplit=1) always
        # yields exactly two parts; the former `len(parts) != 2` check
        # was unreachable dead code and has been removed.
        raw_pattern, raw_name = line.split("=>", maxsplit=1)
        path_pattern = raw_pattern.strip()
        logical_name = raw_name.strip()

        if not path_pattern or not logical_name:
            raise ValueError(
                f"Invalid group specification at line {line_num}: "
                f"Pattern and name cannot be empty. Line: '{line}'"
            )

        try:
            if path_pattern.startswith("^"):
                # Explicit regex pattern - use as-is.
                compiled_pattern = re.compile(path_pattern)
            else:
                # Plain text path: escape regex metacharacters, then anchor
                # it as a leading directory prefix (^<path>/).
                compiled_pattern = re.compile(f"^{re.escape(path_pattern)}/")
        except re.error as e:
            raise ValueError(
                f"Invalid regex pattern at line {line_num}: {path_pattern}. Error: {str(e)}"
            ) from e

        patterns.append(GroupPattern(pattern=compiled_pattern, logical_name=logical_name))

    return patterns
114
+
115
+
116
def map_entities_to_groups(df: pd.DataFrame, patterns: list[GroupPattern]) -> pd.DataFrame:
    """Map entity names to logical groups based on patterns.

    Patterns are tried in order and the first match wins.  Entities that
    match no pattern are dropped from the result (not passed through).

    Args:
        df: DataFrame with GitLogSchema columns
        patterns: List of compiled GroupPattern objects

    Returns:
        DataFrame with entity names replaced by logical group names,
        containing only rows whose entity matched some pattern.
    """
    # Empty input: return the canonical empty schema.
    if df.empty:
        return GitLogSchema.create_empty_dataframe()

    # No patterns means nothing can match, so everything is filtered out.
    if not patterns:
        return df.iloc[:0].copy()

    def resolve(entity: str) -> str | None:
        """Return the first matching logical name, or None."""
        for candidate in patterns:
            if candidate.pattern.match(entity):
                return candidate.logical_name
        return None

    entity_col = GitLogSchema.ENTITY
    working = df.copy()
    working["_logical_group"] = working[entity_col].apply(resolve)

    # Keep only mapped rows, then substitute the logical name for the path.
    mapped = working[working["_logical_group"].notna()].copy()
    if not mapped.empty:
        mapped[entity_col] = mapped["_logical_group"]

    return mapped.drop(columns=["_logical_group"])
173
+
174
+
175
def load_group_specification_file(filepath: Path | str) -> list[GroupPattern]:
    """Load and parse a grouping specification file.

    Convenience wrapper: validates the path, reads the file as UTF-8,
    and delegates to :func:`parse_group_specification`.

    Args:
        filepath: Path to grouping specification file

    Returns:
        List of compiled GroupPattern objects

    Raises:
        FileNotFoundError: If file doesn't exist
        ValueError: If the path is not a regular file or contents are invalid
    """
    spec_path = Path(filepath)

    if not spec_path.exists():
        raise FileNotFoundError(f"Group specification file not found: {spec_path}")
    if not spec_path.is_file():
        raise ValueError(f"Not a file: {spec_path}")

    return parse_group_specification(spec_path.read_text(encoding="utf-8"))
@@ -0,0 +1,132 @@
1
+ """Team mapper for aggregating author contributions to team level.
2
+
3
+ This module provides functionality to map individual authors to teams, allowing
4
+ analysis at the organizational team level rather than individual contributor level.
5
+
6
+ Team mappings are specified via CSV files with format:
7
+ author,team
8
+ John Doe,Backend Team
9
+ Jane Smith,Backend Team
10
+
11
+ Authors not in the mapping are preserved as-is (treated as individual "teams"),
12
+ which allows detection of missing mappings rather than silent data loss.
13
+ """
14
+
15
+ from io import StringIO
16
+ from pathlib import Path
17
+
18
+ import pandas as pd
19
+
20
+ from code_maat_python.parser import GitLogSchema
21
+
22
+
23
+ def parse_team_mapping_csv(csv_content: str) -> dict[str, str]:
24
+ """Parse a team mapping CSV into author->team dictionary.
25
+
26
+ The CSV format expected:
27
+ author,team
28
+ John Doe,Backend Team
29
+ Jane Smith,Frontend Team
30
+
31
+ Args:
32
+ csv_content: CSV string with author,team columns
33
+
34
+ Returns:
35
+ Dictionary mapping author names to team names
36
+
37
+ Raises:
38
+ ValueError: If CSV is missing required columns
39
+
40
+ Examples:
41
+ >>> csv = "author,team\\nAlice,Backend\\nBob,Frontend"
42
+ >>> mapping = parse_team_mapping_csv(csv)
43
+ >>> mapping["Alice"]
44
+ 'Backend'
45
+ """
46
+ # Parse CSV (keep_default_na=False to preserve empty strings)
47
+ df = pd.read_csv(StringIO(csv_content), keep_default_na=False)
48
+
49
+ # Validate required columns
50
+ required_cols = {"author", "team"}
51
+ if not required_cols.issubset(set(df.columns)):
52
+ missing = required_cols - set(df.columns)
53
+ raise ValueError(
54
+ f"Missing required columns in team mapping CSV: {missing}. "
55
+ f"Expected columns: {required_cols}, got: {set(df.columns)}"
56
+ )
57
+
58
+ # Build author->team lookup dictionary
59
+ # If there are duplicates, last one wins (pandas default behavior)
60
+ # strict=False allows different lengths (though they should match in valid CSV)
61
+ mapping = dict(zip(df["author"], df["team"], strict=False))
62
+
63
+ return mapping
64
+
65
+
66
def map_authors_to_teams(df: pd.DataFrame, team_mapping: dict[str, str]) -> pd.DataFrame:
    """Replace author names with their team names.

    Authors missing from *team_mapping* are kept unchanged (effectively
    one-person teams), which makes incomplete mappings visible instead
    of silently dropping data.

    Args:
        df: DataFrame with GitLogSchema columns
        team_mapping: Dictionary mapping author names to team names

    Returns:
        Copy of *df* with the author column rewritten to team names.
    """
    if df.empty:
        return GitLogSchema.create_empty_dataframe()

    author_col = GitLogSchema.AUTHOR
    mapped = df.copy()
    # dict.get(name, name) falls back to the original author when unmapped.
    mapped[author_col] = [team_mapping.get(name, name) for name in mapped[author_col]]
    return mapped
101
+
102
+
103
+ def load_team_mapping_file(filepath: Path | str) -> dict[str, str]:
104
+ """Load and parse a team mapping CSV file.
105
+
106
+ Convenience function that reads a file and parses its contents.
107
+
108
+ Args:
109
+ filepath: Path to team mapping CSV file
110
+
111
+ Returns:
112
+ Dictionary mapping author names to team names
113
+
114
+ Raises:
115
+ FileNotFoundError: If file doesn't exist
116
+ ValueError: If file is invalid or missing required columns
117
+
118
+ Example:
119
+ >>> mapping = load_team_mapping_file("teams.csv")
120
+ >>> "Alice" in mapping
121
+ True
122
+ """
123
+ filepath = Path(filepath)
124
+
125
+ if not filepath.exists():
126
+ raise FileNotFoundError(f"Team mapping file not found: {filepath}")
127
+
128
+ if not filepath.is_file():
129
+ raise ValueError(f"Not a file: {filepath}")
130
+
131
+ csv_content = filepath.read_text(encoding="utf-8")
132
+ return parse_team_mapping_csv(csv_content)