code-maat-python 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ """
2
+ code-maat-python: Modern Python tool for mining VCS data with pandas.
3
+
4
+ A rewrite of Code Maat (by Adam Tornhill) using pandas for data analysis.
5
+ """
6
+
7
+ __version__ = "0.1.0"
8
+ __author__ = "Cameron Yick"
9
+ __license__ = "GPL-3.0"
10
+
11
+ # Version info
12
+ VERSION = __version__
@@ -0,0 +1,5 @@
1
+ """Allow running code-maat-python as a module with python -m code_maat_python"""
2
+ from code_maat_python.cli import main
3
+
4
+ if __name__ == "__main__":
5
+ main()
@@ -0,0 +1,39 @@
1
+ """Analysis modules for code-maat-python."""
2
+
3
+ from code_maat_python.analyses.authors import analyze_authors
4
+ from code_maat_python.analyses.churn import (
5
+ abs_churn,
6
+ author_churn,
7
+ entity_churn,
8
+ entity_ownership,
9
+ main_dev,
10
+ refactoring_main_dev,
11
+ )
12
+ from code_maat_python.analyses.coupling import analyze_coupling
13
+ from code_maat_python.analyses.effort import (
14
+ entity_effort,
15
+ fragmentation,
16
+ main_dev_by_revs,
17
+ )
18
+ from code_maat_python.analyses.entities import analyze_entities
19
+ from code_maat_python.analyses.revisions import analyze_revisions
20
+ from code_maat_python.analyses.soc import analyze_soc
21
+ from code_maat_python.analyses.summary import analyze_summary
22
+
23
+ __all__ = [
24
+ "abs_churn",
25
+ "analyze_authors",
26
+ "analyze_coupling",
27
+ "analyze_entities",
28
+ "analyze_revisions",
29
+ "analyze_soc",
30
+ "analyze_summary",
31
+ "author_churn",
32
+ "entity_churn",
33
+ "entity_effort",
34
+ "entity_ownership",
35
+ "fragmentation",
36
+ "main_dev",
37
+ "main_dev_by_revs",
38
+ "refactoring_main_dev",
39
+ ]
@@ -0,0 +1,101 @@
1
+ """Code age analysis for code-maat-python."""
2
+
3
+ import pandas as pd
4
+
5
+ from code_maat_python.parser import GitLogSchema
6
+
7
+
8
def code_age(df: pd.DataFrame, reference_date: pd.Timestamp | None = None) -> pd.DataFrame:
    """Calculate age of entities in months since last modification.

    This analysis helps identify stale code that hasn't been modified recently,
    which may indicate technical debt, abandoned features, or stable components.

    Args:
        df: DataFrame with GitLogSchema columns (entity, rev, author, date,
            loc_added, loc_deleted). Multiple commits to the same entity are
            allowed; the latest date is used.
        reference_date: Reference date for age calculation. If None, uses the
            current date. A naive reference date is localized to the data's
            timezone when the commit dates are tz-aware, and a tz-aware
            reference date is made naive (wall-clock) against naive data, so
            mixing the two no longer raises.

    Returns:
        DataFrame with columns:
            - entity: File path (str)
            - age-months: Number of months since last modification (int)
        Sorted by age-months descending (oldest entities first).

        For empty input, returns empty DataFrame with correct schema
        (column validation is skipped in that case).

    Raises:
        ValueError: If a non-empty df doesn't have required columns
            (entity, date).

    Note:
        Age calculation uses 30.44 days per month (365.25/12) for accuracy.
        Ages are rounded to the nearest whole month.
    """
    # Preserve historical behavior: an empty frame short-circuits before
    # any column validation.
    if df.empty:
        return pd.DataFrame(columns=["entity", "age-months"])

    required_cols = {GitLogSchema.ENTITY, GitLogSchema.DATE}
    missing = required_cols - set(df.columns)
    if missing:
        raise ValueError(f"DataFrame missing required columns: {missing}")

    # Normalize the date column to datetime without mutating the caller's frame.
    if not pd.api.types.is_datetime64_any_dtype(df[GitLogSchema.DATE]):
        df = df.copy()
        df[GitLogSchema.DATE] = pd.to_datetime(df[GitLogSchema.DATE])

    # Last modification date per entity.
    last_dates = df.groupby(GitLogSchema.ENTITY, observed=True)[GitLogSchema.DATE].max()

    # BUGFIX: align the reference date's timezone with the data. The previous
    # code always used a naive Timestamp.now(), which raises TypeError when
    # subtracted from tz-aware commit dates.
    data_tz = getattr(last_dates.dtype, "tz", None)
    if reference_date is None:
        reference_date = pd.Timestamp.now(tz=data_tz)
    elif reference_date.tzinfo is None and data_tz is not None:
        reference_date = reference_date.tz_localize(data_tz)
    elif reference_date.tzinfo is not None and data_tz is None:
        # Drop the tz, keeping wall-clock time, to match the naive data.
        reference_date = reference_date.tz_localize(None)

    # Age in months: days / 30.44, rounded to the nearest whole month.
    age_months = ((reference_date - last_dates).dt.days / 30.44).round().astype(int)

    result = pd.DataFrame(
        {
            "entity": last_dates.index,
            "age-months": age_months.values,
        }
    )

    # Oldest entities first; explicit column ordering on the way out.
    result = result.sort_values(by=["age-months"], ascending=False).reset_index(drop=True)
    return result[["entity", "age-months"]]
@@ -0,0 +1,60 @@
1
+ """Author analysis for code-maat-python.
2
+
3
+ Analyzes distinct authors per entity with revision counts.
4
+ """
5
+
6
+ import pandas as pd
7
+
8
+ from code_maat_python.parser import GitLogSchema
9
+
10
+
11
def analyze_authors(df: pd.DataFrame) -> pd.DataFrame:
    """Summarize distinct authors and revision counts per entity.

    For each entity, reports how many different authors touched it and how
    many distinct revisions it went through.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added,
            loc_deleted

    Returns:
        DataFrame with columns:
            - entity: File or module path
            - n-authors: Number of distinct authors
            - n-revs: Number of revisions
        Sorted by n-authors descending, then entity name ascending.
    """
    # An empty log yields an empty result with the expected schema.
    if df.empty:
        return pd.DataFrame(columns=["entity", "n-authors", "n-revs"])

    per_entity = df.groupby(GitLogSchema.ENTITY, observed=True)
    summary = per_entity.agg(
        **{
            "n-authors": (GitLogSchema.AUTHOR, "nunique"),
            "n-revs": (GitLogSchema.REV, "nunique"),
        }
    ).reset_index()

    # Entities touched by the most people float to the top; ties break
    # alphabetically by entity.
    ordered = summary.sort_values(
        by=["n-authors", GitLogSchema.ENTITY], ascending=[False, True]
    )
    return ordered.reset_index(drop=True)
@@ -0,0 +1,353 @@
1
+ """Churn analysis for code-maat-python.
2
+
3
+ Analyzes code churn metrics including lines added/deleted by date, author,
4
+ and entity, as well as main developer identification.
5
+ """
6
+
7
+ import pandas as pd
8
+
9
+ from code_maat_python.parser import GitLogSchema
10
+
11
+
12
def abs_churn(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate absolute code churn per date.

    Totals the lines added and deleted for each date present in the commit
    history; dates without commits do not appear.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added,
            loc_deleted

    Returns:
        DataFrame with columns:
            - date: Commit date
            - added: Total lines added on that date
            - deleted: Total lines deleted on that date
            - commits: Number of commits on that date
        Sorted by date ascending.
    """
    # An empty log yields an empty result with the expected schema.
    if df.empty:
        return pd.DataFrame(columns=["date", "added", "deleted", "commits"])

    daily = (
        df.groupby(GitLogSchema.DATE, observed=True)
        .agg(
            added=(GitLogSchema.LOC_ADDED, "sum"),
            deleted=(GitLogSchema.LOC_DELETED, "sum"),
            commits=(GitLogSchema.REV, "nunique"),
        )
        .reset_index()
        .sort_values(by=[GitLogSchema.DATE], ascending=True)
    )
    return daily.reset_index(drop=True)
57
+
58
+
59
def author_churn(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate total churn per author.

    Totals the lines added and deleted by each contributing author across
    all entities and commits.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added,
            loc_deleted

    Returns:
        DataFrame with columns:
            - author: Author name
            - added: Total lines added by author
            - deleted: Total lines deleted by author
            - commits: Number of commits by author
        Sorted by author name ascending.
    """
    # An empty log yields an empty result with the expected schema.
    if df.empty:
        return pd.DataFrame(columns=["author", "added", "deleted", "commits"])

    per_author = (
        df.groupby(GitLogSchema.AUTHOR, observed=True)
        .agg(
            added=(GitLogSchema.LOC_ADDED, "sum"),
            deleted=(GitLogSchema.LOC_DELETED, "sum"),
            commits=(GitLogSchema.REV, "nunique"),
        )
        .reset_index()
        .sort_values(by=[GitLogSchema.AUTHOR], ascending=True)
    )
    return per_author.reset_index(drop=True)
104
+
105
+
106
def entity_churn(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate absolute churn per entity.

    Totals the lines added and deleted for each entity (file). Results are
    ordered by lines added descending, since added lines are a better
    predictor of post-release defects.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added,
            loc_deleted

    Returns:
        DataFrame with columns:
            - entity: File or module path
            - added: Total lines added to entity
            - deleted: Total lines deleted from entity
            - commits: Number of commits affecting entity
        Sorted by added descending.
    """
    # An empty log yields an empty result with the expected schema.
    if df.empty:
        return pd.DataFrame(columns=["entity", "added", "deleted", "commits"])

    per_entity = (
        df.groupby(GitLogSchema.ENTITY, observed=True)
        .agg(
            added=(GitLogSchema.LOC_ADDED, "sum"),
            deleted=(GitLogSchema.LOC_DELETED, "sum"),
            commits=(GitLogSchema.REV, "nunique"),
        )
        .reset_index()
        # Highest-churn entities first (better predictor of defects).
        .sort_values(by=["added"], ascending=False)
    )
    return per_entity.reset_index(drop=True)
152
+
153
+
154
def entity_ownership(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate ownership of each entity by author based on churn.

    Shows how much each author contributed to each entity in lines added
    and deleted, which helps surface code-ownership patterns.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added,
            loc_deleted

    Returns:
        DataFrame with columns:
            - entity: File or module path
            - author: Author name
            - added: Lines added by author to entity
            - deleted: Lines deleted by author from entity
        Sorted by entity ascending.
    """
    # An empty log yields an empty result with the expected schema.
    if df.empty:
        return pd.DataFrame(columns=["entity", "author", "added", "deleted"])

    contributions = df.groupby(
        [GitLogSchema.ENTITY, GitLogSchema.AUTHOR], observed=True
    ).agg(
        added=(GitLogSchema.LOC_ADDED, "sum"),
        deleted=(GitLogSchema.LOC_DELETED, "sum"),
    )

    ordered = contributions.reset_index().sort_values(
        by=[GitLogSchema.ENTITY], ascending=True
    )
    return ordered.reset_index(drop=True)
199
+
200
+
201
def main_dev(df: pd.DataFrame) -> pd.DataFrame:
    """Identify the main developer of each entity by lines added.

    The main developer is the author who contributed the most added lines
    to an entity; the ownership percentage is that author's share of all
    lines added to the entity.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added,
            loc_deleted

    Returns:
        DataFrame with columns:
            - entity: File or module path
            - main-dev: Primary developer name
            - added: Lines added by main developer
            - total-added: Total lines added to entity
            - ownership: Ownership percentage (0-100)
        Sorted by entity ascending.
    """
    # An empty log yields an empty result with the expected schema.
    if df.empty:
        return pd.DataFrame(
            columns=["entity", "main-dev", "added", "total-added", "ownership"]
        )

    # Lines added per (entity, author) pair.
    per_author = (
        df.groupby([GitLogSchema.ENTITY, GitLogSchema.AUTHOR], observed=True)[
            GitLogSchema.LOC_ADDED
        ]
        .sum()
        .reset_index()
    )

    # Total lines added per entity.
    totals = (
        df.groupby(GitLogSchema.ENTITY, observed=True)[GitLogSchema.LOC_ADDED]
        .sum()
        .reset_index()
        .rename(columns={GitLogSchema.LOC_ADDED: "total-added"})
    )

    # Row of the top contributor within each entity group; idxmax resolves
    # ties to the first row of the grouped frame.
    top_rows = per_author.groupby(GitLogSchema.ENTITY, observed=True)[
        GitLogSchema.LOC_ADDED
    ].idxmax()
    winners = (
        per_author.loc[top_rows]
        .rename(
            columns={
                GitLogSchema.AUTHOR: "main-dev",
                GitLogSchema.LOC_ADDED: "added",
            }
        )
        .reset_index(drop=True)
    )

    merged = winners.merge(totals, on=GitLogSchema.ENTITY)

    # clip(lower=1) guards the division for entities with only deletions.
    merged["ownership"] = (
        merged["added"] / merged["total-added"].clip(lower=1) * 100
    ).round(2)

    merged = merged.sort_values(by=[GitLogSchema.ENTITY], ascending=True)
    return merged.reset_index(drop=True)
276
+
277
+
278
def refactoring_main_dev(df: pd.DataFrame) -> pd.DataFrame:
    """Identify the main developer of each entity by lines removed.

    Alternative to :func:`main_dev` that credits deleted lines instead of
    added ones — the idea being that removing code reflects more active
    design choices and refactoring work.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added,
            loc_deleted

    Returns:
        DataFrame with columns:
            - entity: File or module path
            - main-dev: Primary developer name
            - removed: Lines removed by main developer
            - total-removed: Total lines removed from entity
            - ownership: Ownership percentage (0-100)
        Sorted by entity ascending.
    """
    # An empty log yields an empty result with the expected schema.
    if df.empty:
        return pd.DataFrame(
            columns=["entity", "main-dev", "removed", "total-removed", "ownership"]
        )

    # Lines deleted per (entity, author) pair.
    per_author = (
        df.groupby([GitLogSchema.ENTITY, GitLogSchema.AUTHOR], observed=True)[
            GitLogSchema.LOC_DELETED
        ]
        .sum()
        .reset_index()
    )

    # Total lines deleted per entity.
    totals = (
        df.groupby(GitLogSchema.ENTITY, observed=True)[GitLogSchema.LOC_DELETED]
        .sum()
        .reset_index()
        .rename(columns={GitLogSchema.LOC_DELETED: "total-removed"})
    )

    # Row of the top remover within each entity group; idxmax resolves
    # ties to the first row of the grouped frame.
    top_rows = per_author.groupby(GitLogSchema.ENTITY, observed=True)[
        GitLogSchema.LOC_DELETED
    ].idxmax()
    winners = (
        per_author.loc[top_rows]
        .rename(
            columns={
                GitLogSchema.AUTHOR: "main-dev",
                GitLogSchema.LOC_DELETED: "removed",
            }
        )
        .reset_index(drop=True)
    )

    merged = winners.merge(totals, on=GitLogSchema.ENTITY)

    # clip(lower=1) guards the division for entities with only additions.
    merged["ownership"] = (
        merged["removed"] / merged["total-removed"].clip(lower=1) * 100
    ).round(2)

    merged = merged.sort_values(by=[GitLogSchema.ENTITY], ascending=True)
    return merged.reset_index(drop=True)