code-maat-python 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,146 @@
1
+ """Time-based grouper transformer.
2
+
3
+ Groups commits within time windows into logical changesets.
4
+ """
5
+
6
+ import pandas as pd
7
+
8
+ from code_maat_python.parser import GitLogSchema
9
+
10
+
11
def group_by_time(df: pd.DataFrame, window: str = "1D") -> pd.DataFrame:
    """Group commits within a time window into logical changesets.

    Takes temporal commits and groups them into logical changesets based on
    a sliding time window. Within each window, entities are deduplicated
    (keeping the first occurrence) and assigned a new revision ID based on
    the window end date. An entity is emitted again only once its current
    window no longer overlaps the window in which it was last emitted.

    Args:
        df: DataFrame with GitLogSchema columns
            (rev, date, author, entity, loc_added, loc_deleted)
        window: Pandas frequency string (e.g., '1D', '2D', '1W', '7D').
            Default: '1D' (1 day window).
            See: https://pandas.pydata.org/docs/user_guide/timeseries.html#offset-aliases

    Returns:
        DataFrame with same schema, but commits grouped by time window.
        Entities are deduplicated within windows (keeping first occurrence).
        Revision IDs are set to window end dates in ISO format (YYYY-MM-DD).

    Raises:
        ValueError: If DataFrame is missing required columns or window
            format is invalid.

    Examples:
        >>> data = [
        ...     {"entity": "file.py", "author": "Alice", "rev": "r1",
        ...      "date": "2023-01-01", "loc_added": 1, "loc_deleted": 0},
        ...     {"entity": "file.py", "author": "Bob", "rev": "r2",
        ...      "date": "2023-01-01", "loc_added": 2, "loc_deleted": 1},
        ... ]
        >>> df = pd.DataFrame(data)
        >>> result = group_by_time(df, window="1D")
        >>> len(result)  # Deduplicated to 1 entity
        1
        >>> result.iloc[0]["author"]  # Keeps first occurrence
        'Alice'
        >>> result.iloc[0]["rev"]  # Window end date as revision
        '2023-01-01'
    """
    # STEP 1: Handle empty DataFrame before any column validation.
    if df.empty:
        return GitLogSchema.create_empty_dataframe()

    # STEP 2: Validate required columns.
    required_cols = set(GitLogSchema.columns())
    if not required_cols.issubset(df.columns):
        missing = required_cols - set(df.columns)
        raise ValueError(f"DataFrame missing required columns: {missing}")

    # STEP 3: Validate the window parameter; keep the parsed delta for reuse.
    if not isinstance(window, str):
        raise ValueError(f"window must be a string, got {type(window).__name__}")
    try:
        window_delta = pd.Timedelta(window)  # raises ValueError on bad format
    except ValueError as e:
        raise ValueError(f"Invalid window format '{window}': {e}") from e

    # STEP 4: Ensure date column is datetime (don't mutate input!).
    if not pd.api.types.is_datetime64_any_dtype(df[GitLogSchema.DATE]):
        df = df.copy()
        df[GitLogSchema.DATE] = pd.to_datetime(df[GitLogSchema.DATE])

    # STEP 5: Normalize dates to daily granularity (remove time component).
    df_normalized = df.copy()
    df_normalized[GitLogSchema.DATE] = pd.to_datetime(
        df_normalized[GitLogSchema.DATE].dt.date
    )

    # STEP 6: Apply rolling window logic with smart deduplication.
    # Track entities and when they were last output to avoid duplicating
    # entities in overlapping windows.
    results = []
    one_day = pd.Timedelta("1D")
    entity_last_output: dict[str, pd.Timestamp] = {}  # entity -> last output date

    # Process only dates that have actual commits.
    # NOTE: Series.unique() on a datetime64 column yields numpy datetime64
    # scalars, which lack .strftime(); route through pd.to_datetime so that
    # iteration yields pd.Timestamp objects.
    unique_dates = sorted(pd.to_datetime(df_normalized[GitLogSchema.DATE].unique()))

    for current_date in unique_dates:
        # Window is [window_start, current_date], both endpoints inclusive.
        window_start = current_date - window_delta + one_day

        # Get all commits within this window.
        window_mask = (df_normalized[GitLogSchema.DATE] >= window_start) & (
            df_normalized[GitLogSchema.DATE] <= current_date
        )
        window_data = df_normalized[window_mask]

        if window_data.empty:
            continue

        # STEP 7: Deduplicate entities, keeping the first occurrence.
        # drop_duplicates preserves row order, so "first" is stable.
        unique_data = window_data.drop_duplicates(
            subset=[GitLogSchema.ENTITY],
            keep="first",  # CRITICAL!
        )

        # STEP 8: Filter out entities already output in an overlapping window.
        # The last output's window was
        #   [last_output_date - window_delta + 1D, last_output_date];
        # it overlaps the current window iff last_output_date >= window_start.
        entities_to_output = []
        for idx, row in unique_data.iterrows():
            entity = row[GitLogSchema.ENTITY]
            last_output_date = entity_last_output.get(entity)

            if last_output_date is not None and last_output_date >= window_start:
                # Overlapping window - entity is already represented; skip it.
                continue

            # First sighting, or the previous output window no longer
            # overlaps - output the entity again.
            entities_to_output.append(idx)
            entity_last_output[entity] = current_date

        if entities_to_output:
            output_data = unique_data.loc[entities_to_output].copy()
            # Assign window end date as new revision ID.
            output_data[GitLogSchema.REV] = current_date.strftime("%Y-%m-%d")
            results.append(output_data)

    # STEP 9: Concatenate results (may be empty if every window was skipped).
    if not results:
        return GitLogSchema.create_empty_dataframe()

    result = pd.concat(results, ignore_index=True)

    # STEP 10: Return with GitLogSchema column ordering.
    return result[GitLogSchema.columns()]
File without changes
@@ -0,0 +1,105 @@
1
+ """Mathematical utilities for analyses."""
2
+
3
+ from typing import Union
4
+
5
+ Number = Union[int, float]
6
+
7
+
8
+ def percentage(value: Number, total: Number) -> float:
9
+ """Calculate percentage.
10
+
11
+ Args:
12
+ value: Numerator value
13
+ total: Denominator value
14
+
15
+ Returns:
16
+ Percentage as float (0-100 range)
17
+
18
+ Raises:
19
+ ValueError: If total is zero
20
+
21
+ Example:
22
+ >>> percentage(25, 100)
23
+ 25.0
24
+ """
25
+ if total == 0:
26
+ raise ValueError("Cannot calculate percentage with zero total")
27
+ return (value / total) * 100.0
28
+
29
+
30
+ def average(*values: Number) -> float:
31
+ """Calculate average of values.
32
+
33
+ Args:
34
+ *values: Variable number of numeric values
35
+
36
+ Returns:
37
+ Average as float
38
+
39
+ Raises:
40
+ ValueError: If no values provided
41
+
42
+ Example:
43
+ >>> average(10, 20, 30)
44
+ 20.0
45
+ """
46
+ if not values:
47
+ raise ValueError("Cannot calculate average of zero values")
48
+ return sum(values) / len(values)
49
+
50
+
51
+ def fractal_value(contributions: list[Number]) -> float:
52
+ """Calculate fractal fragmentation value.
53
+
54
+ The fractal value measures how fragmented contributions are across
55
+ contributors. Formula: FV = 1 - Σ(ai/nc)²
56
+
57
+ Args:
58
+ contributions: List of contribution counts per contributor
59
+
60
+ Returns:
61
+ Fractal value in range [0, 1)
62
+ - 0: Single contributor (no fragmentation)
63
+ - →1: Highly fragmented across many contributors
64
+
65
+ Raises:
66
+ ValueError: If contributions list is empty
67
+
68
+ Example:
69
+ >>> fractal_value([100]) # Single contributor
70
+ 0.0
71
+ >>> fractal_value([50, 50]) # Two equal contributors
72
+ 0.5
73
+ >>> fractal_value([33, 33, 34]) # Three ~equal contributors
74
+ 0.6667
75
+ """
76
+ if not contributions:
77
+ raise ValueError("Cannot calculate fractal value with no contributions")
78
+
79
+ total = sum(contributions)
80
+ if total == 0:
81
+ return 0.0
82
+
83
+ return 1.0 - sum((count / total) ** 2 for count in contributions)
84
+
85
+
86
+ def clamp(value: Number, min_value: Number, max_value: Number) -> Number:
87
+ """Clamp value to range [min_value, max_value].
88
+
89
+ Args:
90
+ value: Value to clamp
91
+ min_value: Minimum allowed value
92
+ max_value: Maximum allowed value
93
+
94
+ Returns:
95
+ Clamped value
96
+
97
+ Example:
98
+ >>> clamp(150, 0, 100)
99
+ 100
100
+ >>> clamp(-10, 0, 100)
101
+ 0
102
+ >>> clamp(50, 0, 100)
103
+ 50
104
+ """
105
+ return max(min_value, min(value, max_value))