code-maat-python 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_maat_python/__init__.py +12 -0
- code_maat_python/__main__.py +5 -0
- code_maat_python/analyses/__init__.py +39 -0
- code_maat_python/analyses/age.py +101 -0
- code_maat_python/analyses/authors.py +60 -0
- code_maat_python/analyses/churn.py +353 -0
- code_maat_python/analyses/communication.py +151 -0
- code_maat_python/analyses/coupling.py +136 -0
- code_maat_python/analyses/effort.py +210 -0
- code_maat_python/analyses/entities.py +51 -0
- code_maat_python/analyses/revisions.py +56 -0
- code_maat_python/analyses/soc.py +90 -0
- code_maat_python/analyses/summary.py +61 -0
- code_maat_python/cli.py +822 -0
- code_maat_python/output/__init__.py +0 -0
- code_maat_python/parser.py +232 -0
- code_maat_python/pipeline.py +112 -0
- code_maat_python/transformers/__init__.py +0 -0
- code_maat_python/transformers/grouper.py +204 -0
- code_maat_python/transformers/team_mapper.py +132 -0
- code_maat_python/transformers/time_grouper.py +146 -0
- code_maat_python/utils/__init__.py +0 -0
- code_maat_python/utils/math.py +105 -0
- code_maat_python-0.1.0.dist-info/METADATA +545 -0
- code_maat_python-0.1.0.dist-info/RECORD +28 -0
- code_maat_python-0.1.0.dist-info/WHEEL +4 -0
- code_maat_python-0.1.0.dist-info/entry_points.txt +3 -0
- code_maat_python-0.1.0.dist-info/licenses/LICENSE +674 -0
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""Communication analysis for code-maat-python."""
|
|
2
|
+
|
|
3
|
+
from itertools import combinations
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from code_maat_python.parser import GitLogSchema
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def communication(df: pd.DataFrame, min_shared: int = 5, min_coupling: int = 30) -> pd.DataFrame:
    """Calculate communication needs between developers based on shared entities.

    Developers who touch the same files need to coordinate. For every pair of
    authors, this counts the entities both have modified and normalizes by the
    pair's average total workload so highly active developers are not
    over-reported.

    Args:
        df: DataFrame with GitLogSchema columns (at least entity and author).
            Multiple commits to the same entity by the same author count once.
        min_shared: Minimum number of shared entities for a pair to be reported.
            Default is 5.
        min_coupling: Minimum coupling strength percentage (0-100) for a pair
            to be reported. Default is 30. Strength is:
            round((shared_entities / ((a_total + b_total) / 2)) * 100)

    Returns:
        DataFrame with columns:
            - author: First developer name (str), alphabetically first of the pair
            - peer: Second developer name (str)
            - shared: Number of entities both authors worked on (int)
            - strength: Communication coupling strength percentage (int)
        Sorted by strength descending, then shared descending, then by
        author/peer name ascending for a deterministic order on ties.
        Self-pairs are never included. Empty input, or no pair meeting the
        thresholds, yields an empty DataFrame with the same schema.

    Raises:
        ValueError: If a non-empty df lacks the required columns (entity, author).
    """
    # Empty input short-circuits before column validation (an all-empty frame
    # has no columns at all and should not raise).
    if df.empty:
        return pd.DataFrame(columns=["author", "peer", "shared", "strength"])

    required_cols = {GitLogSchema.ENTITY, GitLogSchema.AUTHOR}
    if not required_cols.issubset(df.columns):
        missing = required_cols - set(df.columns)
        raise ValueError(f"DataFrame missing required columns: {missing}")

    # Deduplicate: repeated commits by the same author to the same entity
    # must count only once.
    unique_pairs = (
        df[[GitLogSchema.ENTITY, GitLogSchema.AUTHOR]].drop_duplicates().reset_index(drop=True)
    )

    # Total entities touched per author -- the normalization denominator.
    # Kept in its own dict (not mixed into the pair counts as self-pairs),
    # so pair bookkeeping stays unambiguous.
    author_totals: dict[str, int] = (
        unique_pairs.groupby(GitLogSchema.AUTHOR, observed=True)[GitLogSchema.ENTITY]
        .nunique()
        .to_dict()
    )

    # Shared-entity counts per author pair, keyed alphabetically so (A, B)
    # and (B, A) collapse to one entry.
    shared_counts: dict[tuple[str, str], int] = {}
    for _entity, group in unique_pairs.groupby(GitLogSchema.ENTITY, observed=True):
        authors = sorted(group[GitLogSchema.AUTHOR].unique())
        for pair_key in combinations(authors, 2):
            shared_counts[pair_key] = shared_counts.get(pair_key, 0) + 1

    results = []
    for (author1, author2), shared_count in shared_counts.items():
        avg_work = (author_totals.get(author1, 0) + author_totals.get(author2, 0)) / 2
        # Guard against division by zero (cannot normally happen for authors
        # that produced a shared pair, but stay defensive).
        if avg_work == 0:
            continue

        strength = int(round((shared_count / avg_work) * 100))

        # Thresholds are applied to the rounded strength value.
        if shared_count >= min_shared and strength >= min_coupling:
            results.append(
                {
                    "author": author1,
                    "peer": author2,
                    "shared": shared_count,
                    "strength": strength,
                }
            )

    if not results:
        return pd.DataFrame(columns=["author", "peer", "shared", "strength"])

    result_df = pd.DataFrame(results)

    # Secondary name keys make the ordering deterministic when several pairs
    # tie on both strength and shared count.
    result_df = result_df.sort_values(
        by=["strength", "shared", "author", "peer"],
        ascending=[False, False, True, True],
    )

    return result_df.reset_index(drop=True)[["author", "peer", "shared", "strength"]]
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""Logical coupling analysis for code-maat-python.
|
|
2
|
+
|
|
3
|
+
Calculate logical coupling between files based on co-change frequency.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from collections import Counter
|
|
7
|
+
from itertools import combinations
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
from code_maat_python.parser import GitLogSchema
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def analyze_coupling(
    df: pd.DataFrame,
    min_revs: int = 5,
    min_shared_revs: int = 5,
    min_coupling: int = 30,
    max_coupling: int = 100,
    max_changeset_size: int = 30,
) -> pd.DataFrame:
    """Calculate logical coupling between files.

    Files that repeatedly change in the same commits are "logically coupled":
    a signal of hidden dependencies or functionality that belongs together.
    Oversized commits are dropped up front, both for performance and because
    bulk changes (reformats, renames) carry little coupling signal.

    Coupling degree for a pair = (shared_revs / avg_revs) * 100, where
    avg_revs is the mean of the two files' individual revision counts
    (counted over the size-filtered commits).

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added, loc_deleted
        min_revs: Minimum average revisions for the pair (default: 5)
        min_shared_revs: Minimum shared revisions for the pair (default: 5)
        min_coupling: Minimum coupling percentage (default: 30)
        max_coupling: Maximum coupling percentage (default: 100)
        max_changeset_size: Commits touching more files than this are ignored
            (default: 30)

    Returns:
        DataFrame with columns:
            - entity: First file of the pair (alphabetically first)
            - coupled: Second file of the pair
            - degree: Coupling degree as a truncated integer percentage
            - average-revs: Average revisions of the pair, truncated to int
        Sorted by degree descending, then by entity and coupled names.
        Empty input or no qualifying pairs yields an empty frame with the
        same schema.

    Example:
        >>> result = analyze_coupling(df, min_revs=1, min_shared_revs=1, min_coupling=50)
    """
    empty_result = pd.DataFrame(columns=["entity", "coupled", "degree", "average-revs"])

    if df.empty:
        return empty_result

    # Drop large commits BEFORE pair generation: combinations() is quadratic
    # in changeset size, so filtering here is what keeps this tractable.
    commit_sizes = df.groupby(GitLogSchema.REV, observed=True)[GitLogSchema.ENTITY].count()
    small_commits = commit_sizes[commit_sizes <= max_changeset_size].index
    scoped = df[df[GitLogSchema.REV].isin(small_commits)]
    if scoped.empty:
        return empty_result

    # Co-change counts per alphabetically ordered file pair. Sorting the
    # files first means combinations() emits each pair in one canonical
    # orientation, so (A, B) and (B, A) never split the count.
    co_changes: Counter = Counter()
    for _rev, commit in scoped.groupby(GitLogSchema.REV, observed=True):
        changed_files = sorted(commit[GitLogSchema.ENTITY].unique())
        if len(changed_files) >= 2:
            co_changes.update(combinations(changed_files, 2))

    if not co_changes:
        return empty_result

    # Revision count per file, over the same size-filtered data.
    revs_per_entity = (
        scoped.groupby(GitLogSchema.ENTITY, observed=True)[GitLogSchema.REV]
        .nunique()
        .to_dict()
    )

    rows = []
    for (first, second), shared_revs in co_changes.items():
        average_revs = (revs_per_entity[first] + revs_per_entity[second]) / 2.0
        degree = (shared_revs / average_revs) * 100.0

        # Guard clauses: every threshold must hold for the pair to be kept.
        if average_revs < min_revs:
            continue
        if shared_revs < min_shared_revs:
            continue
        if not (min_coupling <= degree <= max_coupling):
            continue

        rows.append(
            {
                "entity": first,
                "coupled": second,
                "degree": int(degree),
                "average-revs": int(average_revs),
            }
        )

    if not rows:
        return empty_result

    ordered = pd.DataFrame(rows).sort_values(
        ["degree", "entity", "coupled"], ascending=[False, True, True]
    )
    return ordered.reset_index(drop=True)
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
"""Effort analysis for code-maat-python.
|
|
2
|
+
|
|
3
|
+
Analyzes author contributions by revision count and calculates
|
|
4
|
+
code fragmentation metrics.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from code_maat_python.parser import GitLogSchema
|
|
10
|
+
from code_maat_python.utils.math import fractal_value
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def entity_effort(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate author contribution to each entity by revision count.

    Counts how many revisions each author contributed to each entity -- a
    VCS-agnostic measure of effort.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added, loc_deleted

    Returns:
        DataFrame with columns:
            - entity: File or module path
            - author: Author name
            - author-revs: Number of revisions by this author for this entity
            - total-revs: Total revisions for this entity
        Rows are ordered by entity ascending, then author-revs descending
        within each entity. Empty input yields an empty frame with the same
        schema.

    Example:
        >>> data = [
        ...     {'entity': 'main.py', 'rev': '1', 'author': 'Alice', 'date': '2023-01-01'},
        ...     {'entity': 'main.py', 'rev': '2', 'author': 'Alice', 'date': '2023-01-02'},
        ...     {'entity': 'main.py', 'rev': '3', 'author': 'Bob', 'date': '2023-01-03'},
        ... ]
        >>> df = pd.DataFrame(data)
        >>> result = entity_effort(df)
    """
    # Handle empty DataFrame
    if df.empty:
        return pd.DataFrame(columns=["entity", "author", "author-revs", "total-revs"])

    # Count unique revisions per entity-author combination
    entity_author_revs = (
        df.groupby([GitLogSchema.ENTITY, GitLogSchema.AUTHOR], observed=True)[GitLogSchema.REV]
        .nunique()
        .reset_index()
        .rename(columns={GitLogSchema.REV: "author-revs"})
    )

    # Count total unique revisions per entity
    entity_total_revs = (
        df.groupby(GitLogSchema.ENTITY, observed=True)[GitLogSchema.REV]
        .nunique()
        .reset_index()
        .rename(columns={GitLogSchema.REV: "total-revs"})
    )

    # Merge to get complete picture
    result = entity_author_revs.merge(entity_total_revs, on=GitLogSchema.ENTITY)

    # Single stable multi-key sort. The previous two-pass approach sorted by
    # author-revs with pandas' default (non-stable) quicksort first, which
    # left the order of tied authors within an entity arbitrary; one stable
    # sort on both keys is equivalent on the intended ordering and
    # deterministic on ties.
    result = result.sort_values(
        by=[GitLogSchema.ENTITY, "author-revs"],
        ascending=[True, False],
        kind="stable",
    )

    return result.reset_index(drop=True)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def main_dev_by_revs(df: pd.DataFrame) -> pd.DataFrame:
    """Identify the main developer of each entity by revision count.

    The main developer is the author with the most revisions for the entity;
    ownership is that author's share of the entity's total revisions.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added, loc_deleted

    Returns:
        DataFrame with columns:
            - entity: File or module path
            - main-dev: Primary developer name
            - added: Number of revisions by the main developer
            - total-added: Total revisions for the entity
            - ownership: Ownership percentage (0-100), rounded to 2 decimals
        Sorted by entity ascending. Empty input yields an empty frame with
        the same schema.

    Example:
        >>> data = [
        ...     {'entity': 'main.py', 'rev': '1', 'author': 'Alice', 'date': '2023-01-01'},
        ...     {'entity': 'main.py', 'rev': '2', 'author': 'Alice', 'date': '2023-01-02'},
        ...     {'entity': 'main.py', 'rev': '3', 'author': 'Bob', 'date': '2023-01-03'},
        ... ]
        >>> df = pd.DataFrame(data)
        >>> result = main_dev_by_revs(df)
    """
    if df.empty:
        return pd.DataFrame(columns=["entity", "main-dev", "added", "total-added", "ownership"])

    # Unique revisions contributed per (entity, author).
    per_author = (
        df.groupby([GitLogSchema.ENTITY, GitLogSchema.AUTHOR], observed=True)[GitLogSchema.REV]
        .nunique()
        .reset_index()
        .rename(columns={GitLogSchema.REV: "author-revs"})
    )

    # Total unique revisions per entity, named for the output schema directly.
    per_entity = (
        df.groupby(GitLogSchema.ENTITY, observed=True)[GitLogSchema.REV]
        .nunique()
        .reset_index()
        .rename(columns={GitLogSchema.REV: "total-added"})
    )

    # idxmax picks the row of the top contributor per entity (first row wins
    # on ties, i.e. the alphabetically first author under groupby ordering).
    top_rows = per_author.groupby(GitLogSchema.ENTITY, observed=True)["author-revs"].idxmax()
    winners = (
        per_author.loc[top_rows]
        .rename(columns={GitLogSchema.AUTHOR: "main-dev", "author-revs": "added"})
        .reset_index(drop=True)
    )

    merged = winners.merge(per_entity, on=GitLogSchema.ENTITY)
    merged["ownership"] = (merged["added"] / merged["total-added"] * 100).round(2)

    merged = merged.sort_values(by=[GitLogSchema.ENTITY], ascending=True)
    return merged.reset_index(drop=True)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def fragmentation(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate fragmentation for each entity using the fractal value.

    The fractal value measures how spread out an entity's contributions are
    across authors: FV = 1 - Σ(ai/nc)², where ai is author i's revision count
    and nc the entity's total revisions.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added, loc_deleted

    Returns:
        DataFrame with columns:
            - entity: File or module path
            - fractal-value: Fragmentation metric rounded to 2 decimals
              (0 = single author; approaches 1 as work fragments across many)
            - total-revs: Total revisions for the entity
        Sorted by fractal-value descending, then total-revs descending.
        Empty input yields an empty frame with the same schema.

    Example:
        >>> data = [
        ...     {'entity': 'main.py', 'rev': '1', 'author': 'Alice', 'date': '2023-01-01'},
        ...     {'entity': 'utils.py', 'rev': '2', 'author': 'Alice', 'date': '2023-01-02'},
        ...     {'entity': 'utils.py', 'rev': '3', 'author': 'Bob', 'date': '2023-01-03'},
        ... ]
        >>> df = pd.DataFrame(data)
        >>> result = fragmentation(df)
    """
    if df.empty:
        return pd.DataFrame(columns=["entity", "fractal-value", "total-revs"])

    # Unique revisions per (entity, author) -- the per-author contribution ai.
    per_author = (
        df.groupby([GitLogSchema.ENTITY, GitLogSchema.AUTHOR], observed=True)[GitLogSchema.REV]
        .nunique()
        .reset_index()
        .rename(columns={GitLogSchema.REV: "author-revs"})
    )

    # Total unique revisions per entity -- nc in the formula.
    totals = (
        df.groupby(GitLogSchema.ENTITY, observed=True)[GitLogSchema.REV].nunique().to_dict()
    )

    # One output row per entity, delegating the FV formula to the shared
    # fractal_value utility.
    rows = [
        {
            "entity": entity,
            "fractal-value": round(fractal_value(group["author-revs"].tolist()), 2),
            "total-revs": totals[entity],
        }
        for entity, group in per_author.groupby(GitLogSchema.ENTITY, observed=True)
    ]

    out = pd.DataFrame(rows)
    out = out.sort_values(by=["fractal-value", "total-revs"], ascending=[False, False])
    return out.reset_index(drop=True)
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Entity listing analysis for code-maat-python.
|
|
2
|
+
|
|
3
|
+
Lists all entities with basic statistics.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from code_maat_python.parser import GitLogSchema
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def analyze_entities(df: pd.DataFrame) -> pd.DataFrame:
    """List all entities with statistics.

    Produces a plain listing of every entity (file) with the number of commit
    records in which it appears.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added, loc_deleted

    Returns:
        DataFrame with columns:
            - entity: File or module path
            - n-commits: Number of commit records (entity appearances)
        Sorted by entity name. Empty input yields an empty frame with the
        same schema.

    Example:
        >>> data = [
        ...     {'entity': 'src/main.py', 'rev': '1', 'author': 'Alice', 'date': '2023-01-01'},
        ...     {'entity': 'src/main.py', 'rev': '2', 'author': 'Bob', 'date': '2023-01-02'},
        ...     {'entity': 'src/utils.py', 'rev': '1', 'author': 'Alice', 'date': '2023-01-01'},
        ... ]
        >>> df = pd.DataFrame(data)
        >>> result = analyze_entities(df)
        >>> print(result)
                 entity  n-commits
        0   src/main.py          2
        1  src/utils.py          1
    """
    if df.empty:
        return pd.DataFrame(columns=["entity", "n-commits"])

    # Row count per entity; size() counts appearances, not unique revisions.
    per_entity = df.groupby(GitLogSchema.ENTITY, observed=True).size()
    listing = per_entity.rename("n-commits").reset_index()
    listing = listing.sort_values("entity")

    return listing.reset_index(drop=True)
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""Revision frequency analysis for code-maat-python.
|
|
2
|
+
|
|
3
|
+
Analyzes and sorts entities by their revision frequency.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from code_maat_python.parser import GitLogSchema
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def analyze_revisions(df: pd.DataFrame) -> pd.DataFrame:
    """Sort entities by revision frequency.

    Counts the unique revisions touching each entity and orders the result by
    frequency, surfacing the most frequently changed files in the codebase.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added, loc_deleted

    Returns:
        DataFrame with columns:
            - entity: File or module path
            - n-revs: Number of unique revisions
        Sorted by n-revs descending, then by entity name. Empty input yields
        an empty frame with the same schema.

    Example:
        >>> data = [
        ...     {'entity': 'src/main.py', 'rev': '1', 'author': 'Alice', 'date': '2023-01-01'},
        ...     {'entity': 'src/main.py', 'rev': '2', 'author': 'Bob', 'date': '2023-01-02'},
        ...     {'entity': 'src/main.py', 'rev': '3', 'author': 'Alice', 'date': '2023-01-03'},
        ...     {'entity': 'src/utils.py', 'rev': '1', 'author': 'Alice', 'date': '2023-01-01'},
        ... ]
        >>> df = pd.DataFrame(data)
        >>> result = analyze_revisions(df)
        >>> print(result)
                 entity  n-revs
        0   src/main.py       3
        1  src/utils.py       1
    """
    if df.empty:
        return pd.DataFrame(columns=["entity", "n-revs"])

    # Unique revision count per entity.
    rev_counts = (
        df.groupby(GitLogSchema.ENTITY, observed=True)[GitLogSchema.REV].nunique().reset_index()
    )
    rev_counts.columns = ["entity", "n-revs"]

    # Most-changed entities first; ties broken alphabetically.
    ordered = rev_counts.sort_values(by=["n-revs", "entity"], ascending=[False, True])

    return ordered.reset_index(drop=True)
|