code-maat-python 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_maat_python/__init__.py +12 -0
- code_maat_python/__main__.py +5 -0
- code_maat_python/analyses/__init__.py +39 -0
- code_maat_python/analyses/age.py +101 -0
- code_maat_python/analyses/authors.py +60 -0
- code_maat_python/analyses/churn.py +353 -0
- code_maat_python/analyses/communication.py +151 -0
- code_maat_python/analyses/coupling.py +136 -0
- code_maat_python/analyses/effort.py +210 -0
- code_maat_python/analyses/entities.py +51 -0
- code_maat_python/analyses/revisions.py +56 -0
- code_maat_python/analyses/soc.py +90 -0
- code_maat_python/analyses/summary.py +61 -0
- code_maat_python/cli.py +822 -0
- code_maat_python/output/__init__.py +0 -0
- code_maat_python/parser.py +232 -0
- code_maat_python/pipeline.py +112 -0
- code_maat_python/transformers/__init__.py +0 -0
- code_maat_python/transformers/grouper.py +204 -0
- code_maat_python/transformers/team_mapper.py +132 -0
- code_maat_python/transformers/time_grouper.py +146 -0
- code_maat_python/utils/__init__.py +0 -0
- code_maat_python/utils/math.py +105 -0
- code_maat_python-0.1.0.dist-info/METADATA +545 -0
- code_maat_python-0.1.0.dist-info/RECORD +28 -0
- code_maat_python-0.1.0.dist-info/WHEEL +4 -0
- code_maat_python-0.1.0.dist-info/entry_points.txt +3 -0
- code_maat_python-0.1.0.dist-info/licenses/LICENSE +674 -0
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""Time-based grouper transformer.
|
|
2
|
+
|
|
3
|
+
Groups commits within time windows into logical changesets.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from code_maat_python.parser import GitLogSchema
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def group_by_time(df: pd.DataFrame, window: str = "1D") -> pd.DataFrame:
    """Group commits within a time window into logical changesets.

    Takes temporal commits and groups them into logical changesets based on
    a sliding time window. Within each window, entities are deduplicated
    (keeping the first occurrence) and assigned a new revision ID based on
    the window end date.

    Args:
        df: DataFrame with GitLogSchema columns (rev, date, author, entity, loc_added, loc_deleted)
        window: Pandas frequency string (e.g., '1D', '2D', '1W', '7D')
            Default: '1D' (1 day window)
            See: https://pandas.pydata.org/docs/user_guide/timeseries.html#offset-aliases

    Returns:
        DataFrame with same schema, but commits grouped by time window.
        Entities are deduplicated within windows (keeping first occurrence).
        Revision IDs are set to window end dates in ISO format (YYYY-MM-DD).

    Raises:
        ValueError: If DataFrame is missing required columns or window format is invalid

    Examples:
        >>> data = [
        ...     {"entity": "file.py", "author": "Alice", "rev": "r1", "date": "2023-01-01"},
        ...     {"entity": "file.py", "author": "Bob", "rev": "r2", "date": "2023-01-01"},
        ... ]
        >>> df = pd.DataFrame(data)
        >>> result = group_by_time(df, window="1D")
        >>> len(result)  # Deduplicated to 1 entity
        1
        >>> result.iloc[0]["author"]  # Keeps first occurrence
        'Alice'
        >>> result.iloc[0]["rev"]  # Window end date as revision
        '2023-01-01'
    """
    # STEP 1: Handle empty DataFrame before any column/validation work.
    if df.empty:
        return GitLogSchema.create_empty_dataframe()

    # STEP 2: Validate required columns up front so failures are explicit.
    required_cols = set(GitLogSchema.columns())
    if not required_cols.issubset(df.columns):
        missing = required_cols - set(df.columns)
        raise ValueError(f"DataFrame missing required columns: {missing}")

    # STEP 3: Validate window parameter (type first, then format).
    if not isinstance(window, str):
        raise ValueError(f"window must be a string, got {type(window).__name__}")
    try:
        pd.Timedelta(window)  # Parse-only call: validates the format string.
    except ValueError as e:
        raise ValueError(f"Invalid window format '{window}': {e}") from e

    # STEP 4: Ensure date column is datetime; copy first so the caller's
    # DataFrame is never mutated.
    # NOTE(review): this copy is made redundant by the unconditional copy in
    # STEP 5 below — harmless, but one of the two could be dropped.
    if not pd.api.types.is_datetime64_any_dtype(df[GitLogSchema.DATE]):
        df = df.copy()
        df[GitLogSchema.DATE] = pd.to_datetime(df[GitLogSchema.DATE])

    # STEP 5: Normalize timestamps to daily granularity (drop the time-of-day
    # component) so window arithmetic works on whole days.
    df_normalized = df.copy()
    df_normalized[GitLogSchema.DATE] = pd.to_datetime(
        df_normalized[GitLogSchema.DATE].dt.date
    )

    # STEP 6: Apply rolling-window logic with deduplication across windows.
    # entity_last_output remembers when each entity was last emitted so an
    # entity is not duplicated by overlapping windows.
    results = []
    window_delta = pd.Timedelta(window)
    entity_last_output: dict[str, pd.Timestamp] = {}  # entity -> last output date

    # Iterate only dates that actually have commits (not every calendar day).
    unique_dates = sorted(df_normalized[GitLogSchema.DATE].unique())

    for current_date in unique_dates:
        # Window is [current_date - window + 1 day, current_date], inclusive.
        window_start = current_date - window_delta + pd.Timedelta("1D")

        # Select all commits falling inside this window.
        window_mask = (df_normalized[GitLogSchema.DATE] >= window_start) & (
            df_normalized[GitLogSchema.DATE] <= current_date
        )
        window_data = df_normalized[window_mask]

        if window_data.empty:
            continue

        # STEP 7: Deduplicate entities, keeping the first occurrence.
        # drop_duplicates preserves row order, so "first" is deterministic.
        unique_data = window_data.drop_duplicates(
            subset=[GitLogSchema.ENTITY],
            keep="first",  # Contract: first occurrence wins (see docstring).
        )

        # STEP 8: Skip entities already emitted by an overlapping window.
        # An entity is emitted again only when its current window does not
        # overlap the window in which it was last emitted.
        entities_to_output = []
        for idx, row in unique_data.iterrows():
            entity = row[GitLogSchema.ENTITY]
            last_output_date = entity_last_output.get(entity)

            if last_output_date is None:
                # First time this entity appears anywhere — always emit.
                entities_to_output.append(idx)
                entity_last_output[entity] = current_date
            else:
                # Overlap test between the two emission windows:
                #   last window:    [last_output_date - window + 1D, last_output_date]
                #   current window: [current_date - window + 1D, current_date]
                # They overlap iff last_output_date >= current window start.
                # NOTE(review): this recomputes the same value as window_start
                # above; kept as-is to leave behavior byte-identical.
                current_window_start = current_date - window_delta + pd.Timedelta("1D")

                if last_output_date >= current_window_start:
                    # Overlapping window — already emitted; skip this entity.
                    continue
                else:
                    # Non-overlapping window — emit again and update tracker.
                    entities_to_output.append(idx)
                    entity_last_output[entity] = current_date

        if entities_to_output:
            output_data = unique_data.loc[entities_to_output].copy()
            # The window end date (ISO format) becomes the new revision ID.
            output_data[GitLogSchema.REV] = current_date.strftime("%Y-%m-%d")
            results.append(output_data)

    # STEP 9: Concatenate per-window results; empty means nothing qualified.
    if not results:
        return GitLogSchema.create_empty_dataframe()

    result = pd.concat(results, ignore_index=True)

    # Return columns in canonical GitLogSchema order.
    return result[GitLogSchema.columns()]
|
|
File without changes
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""Mathematical utilities for analyses."""
|
|
2
|
+
|
|
3
|
+
from typing import Union
|
|
4
|
+
|
|
5
|
+
Number = Union[int, float]
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def percentage(value: Number, total: Number) -> float:
    """Return *value* expressed as a percentage of *total*.

    Args:
        value: Numerator value
        total: Denominator value

    Returns:
        Percentage as float (0-100 range for value <= total)

    Raises:
        ValueError: If total is zero

    Example:
        >>> percentage(25, 100)
        25.0
    """
    if total != 0:
        return (value / total) * 100.0
    raise ValueError("Cannot calculate percentage with zero total")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def average(*values: Number) -> float:
    """Compute the arithmetic mean of the given values.

    Args:
        *values: Variable number of numeric values

    Returns:
        Average as float

    Raises:
        ValueError: If no values provided

    Example:
        >>> average(10, 20, 30)
        20.0
    """
    count = len(values)
    if count == 0:
        raise ValueError("Cannot calculate average of zero values")
    return sum(values) / count
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def fractal_value(contributions: list[Number]) -> float:
    """Calculate fractal fragmentation value.

    The fractal value measures how fragmented contributions are across
    contributors. Formula: FV = 1 - Σ(ai/nc)²

    Args:
        contributions: List of contribution counts per contributor

    Returns:
        Fractal value in range [0, 1)
        - 0: Single contributor (no fragmentation)
        - →1: Highly fragmented across many contributors

    Raises:
        ValueError: If contributions list is empty

    Example:
        >>> fractal_value([100])  # Single contributor
        0.0
        >>> fractal_value([50, 50])  # Two equal contributors
        0.5
        >>> round(fractal_value([33, 33, 34]), 4)  # Three ~equal contributors
        0.6666
    """
    if not contributions:
        raise ValueError("Cannot calculate fractal value with no contributions")

    total = sum(contributions)
    # An all-zero contribution list has nothing to fragment; treat as a
    # single-contributor case rather than dividing by zero.
    if total == 0:
        return 0.0

    return 1.0 - sum((count / total) ** 2 for count in contributions)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def clamp(value: Number, min_value: Number, max_value: Number) -> Number:
    """Restrict *value* to the closed range [min_value, max_value].

    Args:
        value: Value to clamp
        min_value: Minimum allowed value
        max_value: Maximum allowed value

    Returns:
        Clamped value

    Example:
        >>> clamp(150, 0, 100)
        100
        >>> clamp(-10, 0, 100)
        0
        >>> clamp(50, 0, 100)
        50
    """
    # Cap at the upper bound first, then apply the lower bound — the same
    # evaluation order as max(min_value, min(value, max_value)).
    clamped = value
    if clamped > max_value:
        clamped = max_value
    if clamped < min_value:
        clamped = min_value
    return clamped
|