code-maat-python 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_maat_python/__init__.py +12 -0
- code_maat_python/__main__.py +5 -0
- code_maat_python/analyses/__init__.py +39 -0
- code_maat_python/analyses/age.py +101 -0
- code_maat_python/analyses/authors.py +60 -0
- code_maat_python/analyses/churn.py +353 -0
- code_maat_python/analyses/communication.py +151 -0
- code_maat_python/analyses/coupling.py +136 -0
- code_maat_python/analyses/effort.py +210 -0
- code_maat_python/analyses/entities.py +51 -0
- code_maat_python/analyses/revisions.py +56 -0
- code_maat_python/analyses/soc.py +90 -0
- code_maat_python/analyses/summary.py +61 -0
- code_maat_python/cli.py +822 -0
- code_maat_python/output/__init__.py +0 -0
- code_maat_python/parser.py +232 -0
- code_maat_python/pipeline.py +112 -0
- code_maat_python/transformers/__init__.py +0 -0
- code_maat_python/transformers/grouper.py +204 -0
- code_maat_python/transformers/team_mapper.py +132 -0
- code_maat_python/transformers/time_grouper.py +146 -0
- code_maat_python/utils/__init__.py +0 -0
- code_maat_python/utils/math.py +105 -0
- code_maat_python-0.1.0.dist-info/METADATA +545 -0
- code_maat_python-0.1.0.dist-info/RECORD +28 -0
- code_maat_python-0.1.0.dist-info/WHEEL +4 -0
- code_maat_python-0.1.0.dist-info/entry_points.txt +3 -0
- code_maat_python-0.1.0.dist-info/licenses/LICENSE +674 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""
|
|
2
|
+
code-maat-python: Modern Python tool for mining VCS data with pandas.
|
|
3
|
+
|
|
4
|
+
A rewrite of Code Maat (by Adam Tornhill) using pandas for data analysis.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
__version__ = "0.1.0"
|
|
8
|
+
__author__ = "Cameron Yick"
|
|
9
|
+
__license__ = "GPL-3.0"
|
|
10
|
+
|
|
11
|
+
# Version info
|
|
12
|
+
VERSION = __version__
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Analysis modules for code-maat-python."""
|
|
2
|
+
|
|
3
|
+
from code_maat_python.analyses.authors import analyze_authors
|
|
4
|
+
from code_maat_python.analyses.churn import (
|
|
5
|
+
abs_churn,
|
|
6
|
+
author_churn,
|
|
7
|
+
entity_churn,
|
|
8
|
+
entity_ownership,
|
|
9
|
+
main_dev,
|
|
10
|
+
refactoring_main_dev,
|
|
11
|
+
)
|
|
12
|
+
from code_maat_python.analyses.coupling import analyze_coupling
|
|
13
|
+
from code_maat_python.analyses.effort import (
|
|
14
|
+
entity_effort,
|
|
15
|
+
fragmentation,
|
|
16
|
+
main_dev_by_revs,
|
|
17
|
+
)
|
|
18
|
+
from code_maat_python.analyses.entities import analyze_entities
|
|
19
|
+
from code_maat_python.analyses.revisions import analyze_revisions
|
|
20
|
+
from code_maat_python.analyses.soc import analyze_soc
|
|
21
|
+
from code_maat_python.analyses.summary import analyze_summary
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"abs_churn",
|
|
25
|
+
"analyze_authors",
|
|
26
|
+
"analyze_coupling",
|
|
27
|
+
"analyze_entities",
|
|
28
|
+
"analyze_revisions",
|
|
29
|
+
"analyze_soc",
|
|
30
|
+
"analyze_summary",
|
|
31
|
+
"author_churn",
|
|
32
|
+
"entity_churn",
|
|
33
|
+
"entity_effort",
|
|
34
|
+
"entity_ownership",
|
|
35
|
+
"fragmentation",
|
|
36
|
+
"main_dev",
|
|
37
|
+
"main_dev_by_revs",
|
|
38
|
+
"refactoring_main_dev",
|
|
39
|
+
]
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""Code age analysis for code-maat-python."""
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from code_maat_python.parser import GitLogSchema
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def code_age(df: pd.DataFrame, reference_date: pd.Timestamp | None = None) -> pd.DataFrame:
|
|
9
|
+
"""Calculate age of entities in months since last modification.
|
|
10
|
+
|
|
11
|
+
This analysis helps identify stale code that hasn't been modified recently,
|
|
12
|
+
which may indicate technical debt, abandoned features, or stable components.
|
|
13
|
+
|
|
14
|
+
Args:
|
|
15
|
+
df: DataFrame with GitLogSchema columns (entity, rev, author, date, loc_added, loc_deleted).
|
|
16
|
+
Multiple commits to the same entity are allowed; the latest date is used.
|
|
17
|
+
reference_date: Reference date for age calculation. If None, uses current date.
|
|
18
|
+
Should be timezone-naive or match the timezone of dates in df.
|
|
19
|
+
|
|
20
|
+
Returns:
|
|
21
|
+
DataFrame with columns:
|
|
22
|
+
- entity: File path (str)
|
|
23
|
+
- age-months: Number of months since last modification (int)
|
|
24
|
+
Sorted by age-months descending (oldest entities first).
|
|
25
|
+
|
|
26
|
+
For empty input, returns empty DataFrame with correct schema.
|
|
27
|
+
|
|
28
|
+
Raises:
|
|
29
|
+
ValueError: If df doesn't have required columns (entity, date).
|
|
30
|
+
|
|
31
|
+
Example:
|
|
32
|
+
>>> import pandas as pd
|
|
33
|
+
>>> from code_maat_python.analyses.age import code_age
|
|
34
|
+
>>> from code_maat_python.parser import parse_git_log
|
|
35
|
+
>>>
|
|
36
|
+
>>> # Parse git log
|
|
37
|
+
>>> df = parse_git_log("git.log")
|
|
38
|
+
>>>
|
|
39
|
+
>>> # Calculate age using current date
|
|
40
|
+
>>> result = code_age(df)
|
|
41
|
+
>>> print(result.head())
|
|
42
|
+
entity age-months
|
|
43
|
+
0 legacy_file.py 24
|
|
44
|
+
1 old_module.py 18
|
|
45
|
+
2 stable.py 12
|
|
46
|
+
>>>
|
|
47
|
+
>>> # Calculate age with custom reference date
|
|
48
|
+
>>> ref_date = pd.Timestamp('2023-12-31')
|
|
49
|
+
>>> result = code_age(df, reference_date=ref_date)
|
|
50
|
+
>>> print(result.head())
|
|
51
|
+
entity age-months
|
|
52
|
+
0 legacy_file.py 18
|
|
53
|
+
1 old_module.py 12
|
|
54
|
+
|
|
55
|
+
Note:
|
|
56
|
+
Age calculation uses 30.44 days per month (365.25/12) for accuracy.
|
|
57
|
+
Ages are rounded to the nearest whole month.
|
|
58
|
+
"""
|
|
59
|
+
# 1. Handle empty DataFrame
|
|
60
|
+
if df.empty:
|
|
61
|
+
return pd.DataFrame(columns=["entity", "age-months"])
|
|
62
|
+
|
|
63
|
+
# 2. Validate required columns
|
|
64
|
+
required_cols = {GitLogSchema.ENTITY, GitLogSchema.DATE}
|
|
65
|
+
if not required_cols.issubset(df.columns):
|
|
66
|
+
missing = required_cols - set(df.columns)
|
|
67
|
+
raise ValueError(f"DataFrame missing required columns: {missing}")
|
|
68
|
+
|
|
69
|
+
# 3. Set reference_date to now if None
|
|
70
|
+
if reference_date is None:
|
|
71
|
+
reference_date = pd.Timestamp.now()
|
|
72
|
+
|
|
73
|
+
# 4. Ensure date column is datetime
|
|
74
|
+
if not pd.api.types.is_datetime64_any_dtype(df[GitLogSchema.DATE]):
|
|
75
|
+
df = df.copy()
|
|
76
|
+
df[GitLogSchema.DATE] = pd.to_datetime(df[GitLogSchema.DATE])
|
|
77
|
+
|
|
78
|
+
# 5. Find last modification date per entity (groupby + max)
|
|
79
|
+
last_dates = df.groupby(GitLogSchema.ENTITY, observed=True)[GitLogSchema.DATE].max()
|
|
80
|
+
|
|
81
|
+
# 6. Calculate age in months: (reference_date - last_date).days / 30.44
|
|
82
|
+
age_days = (reference_date - last_dates).dt.days
|
|
83
|
+
age_months = age_days / 30.44
|
|
84
|
+
|
|
85
|
+
# 7. Round to nearest month and convert to int
|
|
86
|
+
age_months = age_months.round().astype(int)
|
|
87
|
+
|
|
88
|
+
# 8. Create result DataFrame
|
|
89
|
+
result = pd.DataFrame(
|
|
90
|
+
{
|
|
91
|
+
"entity": last_dates.index,
|
|
92
|
+
"age-months": age_months.values,
|
|
93
|
+
}
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
# 9. Sort by age descending (oldest first)
|
|
97
|
+
result = result.sort_values(by=["age-months"], ascending=False)
|
|
98
|
+
|
|
99
|
+
# 10. Reset index and return with explicit column ordering
|
|
100
|
+
result = result.reset_index(drop=True)
|
|
101
|
+
return result[["entity", "age-months"]]
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Author analysis for code-maat-python.
|
|
2
|
+
|
|
3
|
+
Analyzes distinct authors per entity with revision counts.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from code_maat_python.parser import GitLogSchema
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def analyze_authors(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise each entity by its distinct-author and revision counts.

    Shows how many different people have touched each entity and how
    often it has changed — a proxy for coordination hot spots.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added, loc_deleted

    Returns:
        DataFrame with columns:
        - entity: File or module path
        - n-authors: Number of distinct authors
        - n-revs: Number of revisions
        Sorted by n-authors descending, then entity ascending.
    """
    # Nothing to analyse: keep the output schema stable.
    if df.empty:
        return pd.DataFrame(columns=["entity", "n-authors", "n-revs"])

    grouped = df.groupby(GitLogSchema.ENTITY, observed=True)

    # Build the summary directly from the per-group unique counts.
    summary = pd.DataFrame(
        {
            "n-authors": grouped[GitLogSchema.AUTHOR].nunique(),
            "n-revs": grouped[GitLogSchema.REV].nunique(),
        }
    ).reset_index()

    # Entities with the most contributors first; ties broken by name.
    ordered = summary.sort_values(
        by=["n-authors", GitLogSchema.ENTITY], ascending=[False, True]
    )
    return ordered.reset_index(drop=True)
|
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
"""Churn analysis for code-maat-python.
|
|
2
|
+
|
|
3
|
+
Analyzes code churn metrics including lines added/deleted by date, author,
|
|
4
|
+
and entity, as well as main developer identification.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from code_maat_python.parser import GitLogSchema
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def abs_churn(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate absolute code churn per date.

    Totals the lines added and deleted for every date that appears in
    the commit history; dates without commits are absent.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added, loc_deleted

    Returns:
        DataFrame with columns:
        - date: Commit date
        - added: Total lines added on that date
        - deleted: Total lines deleted on that date
        - commits: Number of commits on that date
        Sorted by date ascending.
    """
    # Empty log: return the schema with no rows.
    if df.empty:
        return pd.DataFrame(columns=["date", "added", "deleted", "commits"])

    per_date = df.groupby(GitLogSchema.DATE, observed=True)

    # Assemble the per-date totals from the grouped series.
    churn = pd.DataFrame(
        {
            "added": per_date[GitLogSchema.LOC_ADDED].sum(),
            "deleted": per_date[GitLogSchema.LOC_DELETED].sum(),
            "commits": per_date[GitLogSchema.REV].nunique(),
        }
    ).reset_index()

    # Chronological order.
    return churn.sort_values(by=[GitLogSchema.DATE], ascending=True).reset_index(drop=True)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def author_churn(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate total churn per author.

    Totals the lines added and deleted by each contributor across all
    entities and commits.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added, loc_deleted

    Returns:
        DataFrame with columns:
        - author: Author name
        - added: Total lines added by author
        - deleted: Total lines deleted by author
        - commits: Number of commits by author
        Sorted by author name ascending.
    """
    # Empty log: return the schema with no rows.
    if df.empty:
        return pd.DataFrame(columns=["author", "added", "deleted", "commits"])

    per_author = df.groupby(GitLogSchema.AUTHOR, observed=True)

    # Assemble the per-author totals from the grouped series.
    churn = pd.DataFrame(
        {
            "added": per_author[GitLogSchema.LOC_ADDED].sum(),
            "deleted": per_author[GitLogSchema.LOC_DELETED].sum(),
            "commits": per_author[GitLogSchema.REV].nunique(),
        }
    ).reset_index()

    # Alphabetical by author.
    return churn.sort_values(by=[GitLogSchema.AUTHOR], ascending=True).reset_index(drop=True)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def entity_churn(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate absolute churn per entity.

    Totals the lines added and deleted for each entity (file). Entities
    are ordered by lines added, descending, since added lines are a
    better predictor of post-release defects.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added, loc_deleted

    Returns:
        DataFrame with columns:
        - entity: File or module path
        - added: Total lines added to entity
        - deleted: Total lines deleted from entity
        - commits: Number of commits affecting entity
        Sorted by added descending.
    """
    # Empty log: return the schema with no rows.
    if df.empty:
        return pd.DataFrame(columns=["entity", "added", "deleted", "commits"])

    per_entity = df.groupby(GitLogSchema.ENTITY, observed=True)

    # Assemble the per-entity totals from the grouped series.
    churn = pd.DataFrame(
        {
            "added": per_entity[GitLogSchema.LOC_ADDED].sum(),
            "deleted": per_entity[GitLogSchema.LOC_DELETED].sum(),
            "commits": per_entity[GitLogSchema.REV].nunique(),
        }
    ).reset_index()

    # Most-grown entities first (defect-risk proxy).
    return churn.sort_values(by=["added"], ascending=False).reset_index(drop=True)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def entity_ownership(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate ownership of each entity by author based on churn.

    Breaks down the lines added and deleted per (entity, author) pair,
    exposing who has contributed what to each file.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added, loc_deleted

    Returns:
        DataFrame with columns:
        - entity: File or module path
        - author: Author name
        - added: Lines added by author to entity
        - deleted: Lines deleted by author from entity
        Sorted by entity ascending.
    """
    # Empty log: return the schema with no rows.
    if df.empty:
        return pd.DataFrame(columns=["entity", "author", "added", "deleted"])

    pairs = df.groupby([GitLogSchema.ENTITY, GitLogSchema.AUTHOR], observed=True)

    # Totals per (entity, author) pair, flattened back to columns.
    ownership = pd.DataFrame(
        {
            "added": pairs[GitLogSchema.LOC_ADDED].sum(),
            "deleted": pairs[GitLogSchema.LOC_DELETED].sum(),
        }
    ).reset_index()

    # Group rows of the same entity together.
    return ownership.sort_values(by=[GitLogSchema.ENTITY], ascending=True).reset_index(drop=True)
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def main_dev(df: pd.DataFrame) -> pd.DataFrame:
    """Identify the main developer of each entity by lines added.

    The main developer is the author who has added the most lines of
    code to the entity; the result includes an ownership percentage.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added, loc_deleted

    Returns:
        DataFrame with columns:
        - entity: File or module path
        - main-dev: Primary developer name
        - added: Lines added by main developer
        - total-added: Total lines added to entity
        - ownership: Ownership percentage (0-100)
        Sorted by entity ascending.
    """
    # Empty log: return the schema with no rows.
    if df.empty:
        return pd.DataFrame(
            columns=["entity", "main-dev", "added", "total-added", "ownership"]
        )

    # Lines added per (entity, author) pair.
    per_author = (
        df.groupby([GitLogSchema.ENTITY, GitLogSchema.AUTHOR], observed=True)[
            GitLogSchema.LOC_ADDED
        ]
        .sum()
        .reset_index()
    )

    # Total lines added per entity (the ownership denominator).
    totals = (
        df.groupby(GitLogSchema.ENTITY, observed=True)[GitLogSchema.LOC_ADDED]
        .sum()
        .reset_index()
        .rename(columns={GitLogSchema.LOC_ADDED: "total-added"})
    )

    # Pick the top contributor row for every entity.
    top_rows = per_author.groupby(GitLogSchema.ENTITY, observed=True)[
        GitLogSchema.LOC_ADDED
    ].idxmax()
    winners = (
        per_author.loc[top_rows]
        .rename(
            columns={
                GitLogSchema.AUTHOR: "main-dev",
                GitLogSchema.LOC_ADDED: "added",
            }
        )
        .reset_index(drop=True)
    )

    merged = winners.merge(totals, on=GitLogSchema.ENTITY)

    # clip(lower=1) guards the division when an entity saw only deletions.
    merged["ownership"] = (
        merged["added"] / merged["total-added"].clip(lower=1) * 100
    ).round(2)

    return merged.sort_values(by=[GitLogSchema.ENTITY], ascending=True).reset_index(drop=True)
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def refactoring_main_dev(df: pd.DataFrame) -> pd.DataFrame:
    """Identify the main developer of each entity by lines removed.

    An alternative main-developer measure: the author who has deleted
    the most lines, on the theory that removing code reflects active
    design choices and refactoring.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added, loc_deleted

    Returns:
        DataFrame with columns:
        - entity: File or module path
        - main-dev: Primary developer name
        - removed: Lines removed by main developer
        - total-removed: Total lines removed from entity
        - ownership: Ownership percentage (0-100)
        Sorted by entity ascending.
    """
    # Empty log: return the schema with no rows.
    if df.empty:
        return pd.DataFrame(
            columns=["entity", "main-dev", "removed", "total-removed", "ownership"]
        )

    # Lines deleted per (entity, author) pair.
    per_author = (
        df.groupby([GitLogSchema.ENTITY, GitLogSchema.AUTHOR], observed=True)[
            GitLogSchema.LOC_DELETED
        ]
        .sum()
        .reset_index()
    )

    # Total lines deleted per entity (the ownership denominator).
    totals = (
        df.groupby(GitLogSchema.ENTITY, observed=True)[GitLogSchema.LOC_DELETED]
        .sum()
        .reset_index()
        .rename(columns={GitLogSchema.LOC_DELETED: "total-removed"})
    )

    # Pick the top remover row for every entity.
    top_rows = per_author.groupby(GitLogSchema.ENTITY, observed=True)[
        GitLogSchema.LOC_DELETED
    ].idxmax()
    winners = (
        per_author.loc[top_rows]
        .rename(
            columns={
                GitLogSchema.AUTHOR: "main-dev",
                GitLogSchema.LOC_DELETED: "removed",
            }
        )
        .reset_index(drop=True)
    )

    merged = winners.merge(totals, on=GitLogSchema.ENTITY)

    # clip(lower=1) guards the division when an entity saw only additions.
    merged["ownership"] = (
        merged["removed"] / merged["total-removed"].clip(lower=1) * 100
    ).round(2)

    return merged.sort_values(by=[GitLogSchema.ENTITY], ascending=True).reset_index(drop=True)
|