code-maat-python 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,151 @@
1
+ """Communication analysis for code-maat-python."""
2
+
3
+ from itertools import combinations
4
+
5
+ import pandas as pd
6
+
7
+ from code_maat_python.parser import GitLogSchema
8
+
9
+
10
def communication(df: pd.DataFrame, min_shared: int = 5, min_coupling: int = 30) -> pd.DataFrame:
    """Find developer pairs with a communication need based on shared entities.

    Two developers who touch the same files need to coordinate. The strength
    metric normalizes the shared-entity count by the average of each
    developer's total entity count, so very active developers are not
    over-represented.

    Args:
        df: DataFrame with GitLogSchema columns (entity, rev, author, date,
            loc_added, loc_deleted). Repeated commits by the same author to
            the same entity are deduplicated.
        min_shared: Minimum number of shared entities for a pair to appear.
            Default is 5.
        min_coupling: Minimum coupling strength percentage (0-100) for a pair
            to appear. Strength = (shared / avg(total_a, total_b)) * 100.
            Default is 30.

    Returns:
        DataFrame with columns:
            - author: First developer (str), alphabetically first of the pair
            - peer: Second developer (str)
            - shared: Entities both worked on (int)
            - strength: Coupling strength percentage (int)
        Sorted by strength descending, then shared descending. Self-pairs are
        never emitted. Empty input (or no pair passing the thresholds) yields
        an empty DataFrame with the same schema.

    Raises:
        ValueError: If df lacks the required entity/author columns.
    """
    # Empty input: nothing to analyze.
    if df.empty:
        return pd.DataFrame(columns=["author", "peer", "shared", "strength"])

    # Validate required columns up front.
    required_cols = {GitLogSchema.ENTITY, GitLogSchema.AUTHOR}
    if not required_cols.issubset(df.columns):
        missing = required_cols - set(df.columns)
        raise ValueError(f"DataFrame missing required columns: {missing}")

    # One row per (entity, author): multiple commits count once.
    touched = (
        df[[GitLogSchema.ENTITY, GitLogSchema.AUTHOR]].drop_duplicates().reset_index(drop=True)
    )

    # Total distinct entities per author -- the normalization denominator.
    totals: dict[str, int] = (
        touched.groupby(GitLogSchema.AUTHOR, observed=True)[GitLogSchema.ENTITY]
        .nunique()
        .to_dict()
    )

    # Count shared entities per alphabetically ordered author pair.
    shared_counts: dict[tuple[str, str], int] = {}
    for _entity, grp in touched.groupby(GitLogSchema.ENTITY, observed=True):
        for pair in combinations(sorted(grp[GitLogSchema.AUTHOR].unique()), 2):
            shared_counts[pair] = shared_counts.get(pair, 0) + 1

    # Compute the strength metric and apply both thresholds.
    rows = []
    for (dev_a, dev_b), shared in shared_counts.items():
        avg_work = (totals.get(dev_a, 0) + totals.get(dev_b, 0)) / 2
        # Guard against division by zero (cannot normally happen for a
        # pair that shares at least one entity, but stay defensive).
        if avg_work == 0:
            continue
        strength = int(round((shared / avg_work) * 100))
        if shared >= min_shared and strength >= min_coupling:
            rows.append(
                {
                    "author": dev_a,
                    "peer": dev_b,
                    "shared": shared,
                    "strength": strength,
                }
            )

    if not rows:
        return pd.DataFrame(columns=["author", "peer", "shared", "strength"])

    # Strongest communication needs first; ties broken by shared count.
    out = pd.DataFrame(rows).sort_values(by=["strength", "shared"], ascending=[False, False])
    out = out.reset_index(drop=True)
    return out[["author", "peer", "shared", "strength"]]
@@ -0,0 +1,136 @@
1
+ """Logical coupling analysis for code-maat-python.
2
+
3
+ Calculate logical coupling between files based on co-change frequency.
4
+ """
5
+
6
+ from collections import Counter
7
+ from itertools import combinations
8
+
9
+ import pandas as pd
10
+
11
+ from code_maat_python.parser import GitLogSchema
12
+
13
+
14
def analyze_coupling(
    df: pd.DataFrame,
    min_revs: int = 5,
    min_shared_revs: int = 5,
    min_coupling: int = 30,
    max_coupling: int = 100,
    max_changeset_size: int = 30,
) -> pd.DataFrame:
    """Calculate logical coupling between files.

    Files that frequently change together may hide a dependency or an
    architectural problem. Coupling degree for a pair is
    (shared_revs / avg_revs) * 100, where avg_revs is the mean of each
    file's individual revision count.

    Processing steps:
        1. Drop commits touching more than max_changeset_size files
           BEFORE pair generation (keeps the pair count tractable).
        2. Per remaining commit, enumerate file pairs with
           itertools.combinations (sorted, so (A, B) never reappears as (B, A)).
        3. Count pair co-occurrences and per-file revision counts.
        4. Apply min_revs, min_shared_revs, min_coupling, max_coupling.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added, loc_deleted
        min_revs: Minimum average revisions for both modules (default: 5)
        min_shared_revs: Minimum shared revisions between modules (default: 5)
        min_coupling: Minimum coupling percentage threshold (default: 30)
        max_coupling: Maximum coupling percentage threshold (default: 100)
        max_changeset_size: Maximum files per commit to consider (default: 30)

    Returns:
        DataFrame with columns:
            - entity: First file of the coupled pair
            - coupled: Second file of the coupled pair
            - degree: Coupling degree as integer percentage (0-100)
            - average-revs: Average revision count as integer
        Sorted by degree descending, then by entity and coupled names.
    """
    empty_result = pd.DataFrame(columns=["entity", "coupled", "degree", "average-revs"])

    if df.empty:
        return empty_result

    # Drop oversized changesets before any pair work (performance-critical).
    commit_sizes = df.groupby(GitLogSchema.REV, observed=True)[GitLogSchema.ENTITY].count()
    keep = commit_sizes[commit_sizes <= max_changeset_size].index
    trimmed = df[df[GitLogSchema.REV].isin(keep)]

    if trimmed.empty:
        return empty_result

    # Tally co-changing pairs, commit by commit.
    pair_counts: Counter = Counter()
    for _rev, group in trimmed.groupby(GitLogSchema.REV, observed=True):
        names = sorted(group[GitLogSchema.ENTITY].unique())
        if len(names) > 1:
            pair_counts.update(combinations(names, 2))

    if not pair_counts:
        return empty_result

    # Revision count per individual module (within the trimmed data).
    module_revs = (
        trimmed.groupby(GitLogSchema.ENTITY, observed=True)[GitLogSchema.REV]
        .nunique()
        .to_dict()
    )

    # Degree per pair, then threshold filtering.
    rows = []
    for (first, second), shared_revs in pair_counts.items():
        avg_revs = (module_revs[first] + module_revs[second]) / 2.0
        coupling = (shared_revs / avg_revs) * 100.0

        meets_thresholds = (
            avg_revs >= min_revs
            and shared_revs >= min_shared_revs
            and min_coupling <= coupling <= max_coupling
        )
        if meets_thresholds:
            rows.append(
                {
                    "entity": first,
                    "coupled": second,
                    # Degree is truncated (not rounded) to an int percentage.
                    "degree": int(coupling),
                    "average-revs": int(avg_revs),
                }
            )

    if not rows:
        return empty_result

    # Strongest coupling first; name order makes ties deterministic.
    ordered = pd.DataFrame(rows).sort_values(
        ["degree", "entity", "coupled"], ascending=[False, True, True]
    )
    return ordered.reset_index(drop=True)
@@ -0,0 +1,210 @@
1
+ """Effort analysis for code-maat-python.
2
+
3
+ Analyzes author contributions by revision count and calculates
4
+ code fragmentation metrics.
5
+ """
6
+
7
+ import pandas as pd
8
+
9
+ from code_maat_python.parser import GitLogSchema
10
+ from code_maat_python.utils.math import fractal_value
11
+
12
+
13
def entity_effort(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate author contribution to each entity by revision count.

    Identifies how many revisions each author contributed to each entity,
    providing a measure of effort that works for all VCS systems.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added, loc_deleted

    Returns:
        DataFrame with columns:
            - entity: File or module path
            - author: Author name
            - author-revs: Number of revisions by author for this entity
            - total-revs: Total revisions for this entity
        Sorted by entity ascending, then author-revs descending; rows tying
        on both keys keep their prior (entity, author) groupby order.
    """
    # Handle empty DataFrame
    if df.empty:
        return pd.DataFrame(columns=["entity", "author", "author-revs", "total-revs"])

    # Count revisions per entity-author combination
    entity_author_revs = (
        df.groupby([GitLogSchema.ENTITY, GitLogSchema.AUTHOR], observed=True)[GitLogSchema.REV]
        .nunique()
        .reset_index()
        .rename(columns={GitLogSchema.REV: "author-revs"})
    )

    # Count total revisions per entity
    entity_total_revs = (
        df.groupby(GitLogSchema.ENTITY, observed=True)[GitLogSchema.REV]
        .nunique()
        .reset_index()
        .rename(columns={GitLogSchema.REV: "total-revs"})
    )

    # Merge to get complete picture
    result = entity_author_revs.merge(entity_total_revs, on=GitLogSchema.ENTITY)

    # Single multi-key sort. Pandas uses a stable lexsort for multi-column
    # sorts, so tie order is deterministic -- unlike the previous two-pass
    # approach whose first pass (author-revs, default quicksort) was
    # unstable, leaving tie order among equal author-revs unspecified.
    result = result.sort_values(
        by=[GitLogSchema.ENTITY, "author-revs"], ascending=[True, False]
    )

    return result.reset_index(drop=True)
69
+
70
+
71
+ def main_dev_by_revs(df: pd.DataFrame) -> pd.DataFrame:
72
+ """Identify the main developer of each entity by revision count.
73
+
74
+ The main developer is the author who has contributed the most revisions
75
+ to each entity. Returns ownership percentage based on revision count.
76
+
77
+ Args:
78
+ df: DataFrame with columns: entity, rev, author, date, loc_added, loc_deleted
79
+
80
+ Returns:
81
+ DataFrame with columns:
82
+ - entity: File or module path
83
+ - main-dev: Primary developer name
84
+ - added: Number of revisions by main developer
85
+ - total-added: Total revisions for entity
86
+ - ownership: Ownership percentage (0-100)
87
+
88
+ Example:
89
+ >>> data = [
90
+ ... {'entity': 'main.py', 'rev': '1', 'author': 'Alice', 'date': '2023-01-01'},
91
+ ... {'entity': 'main.py', 'rev': '2', 'author': 'Alice', 'date': '2023-01-02'},
92
+ ... {'entity': 'main.py', 'rev': '3', 'author': 'Bob', 'date': '2023-01-03'},
93
+ ... ]
94
+ >>> df = pd.DataFrame(data)
95
+ >>> result = main_dev_by_revs(df)
96
+ """
97
+ # Handle empty DataFrame
98
+ if df.empty:
99
+ return pd.DataFrame(columns=["entity", "main-dev", "added", "total-added", "ownership"])
100
+
101
+ # Count revisions per entity-author combination
102
+ entity_author_revs = (
103
+ df.groupby([GitLogSchema.ENTITY, GitLogSchema.AUTHOR], observed=True)[GitLogSchema.REV]
104
+ .nunique()
105
+ .reset_index()
106
+ .rename(columns={GitLogSchema.REV: "author-revs"})
107
+ )
108
+
109
+ # Count total revisions per entity
110
+ entity_total_revs = (
111
+ df.groupby(GitLogSchema.ENTITY, observed=True)[GitLogSchema.REV]
112
+ .nunique()
113
+ .reset_index()
114
+ .rename(columns={GitLogSchema.REV: "total-revs"})
115
+ )
116
+
117
+ # Find the author with max revisions per entity
118
+ main_devs = (
119
+ entity_author_revs.loc[
120
+ entity_author_revs.groupby(GitLogSchema.ENTITY, observed=True)["author-revs"].idxmax()
121
+ ]
122
+ .rename(
123
+ columns={
124
+ GitLogSchema.AUTHOR: "main-dev",
125
+ "author-revs": "added",
126
+ }
127
+ )
128
+ .reset_index(drop=True)
129
+ )
130
+
131
+ # Merge with total revisions
132
+ result = main_devs.merge(entity_total_revs, on=GitLogSchema.ENTITY)
133
+ result = result.rename(columns={"total-revs": "total-added"})
134
+
135
+ # Calculate ownership percentage
136
+ result["ownership"] = (result["added"] / result["total-added"] * 100).round(2)
137
+
138
+ # Sort by entity ascending
139
+ result = result.sort_values(by=[GitLogSchema.ENTITY], ascending=True)
140
+
141
+ return result.reset_index(drop=True)
142
+
143
+
144
def fragmentation(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate per-entity fragmentation using the fractal value.

    The fractal value FV = 1 - sum((ai/nc)^2) measures how spread out the
    contributions to an entity are across authors: 0 means a single author,
    values approaching 1 mean many authors with small shares.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added, loc_deleted

    Returns:
        DataFrame with columns:
            - entity: File or module path
            - fractal-value: Fragmentation metric, rounded to 2 decimals
            - total-revs: Total revisions for the entity
        Sorted by fractal-value descending, then total-revs descending.
    """
    # Empty input: return the schema only.
    if df.empty:
        return pd.DataFrame(columns=["entity", "fractal-value", "total-revs"])

    # Revisions per (entity, author) pair.
    per_author = (
        df.groupby([GitLogSchema.ENTITY, GitLogSchema.AUTHOR], observed=True)[GitLogSchema.REV]
        .nunique()
        .reset_index()
        .rename(columns={GitLogSchema.REV: "author-revs"})
    )

    # Revisions per entity, as a plain lookup dict.
    totals = (
        df.groupby(GitLogSchema.ENTITY, observed=True)[GitLogSchema.REV].nunique().to_dict()
    )

    # One record per entity, delegating the FV math to the shared utility.
    records = [
        {
            "entity": name,
            "fractal-value": round(fractal_value(grp["author-revs"].tolist()), 2),
            "total-revs": totals[name],
        }
        for name, grp in per_author.groupby(GitLogSchema.ENTITY, observed=True)
    ]

    frame = pd.DataFrame(records)

    # Most fragmented (and, among ties, most revised) entities first.
    frame = frame.sort_values(by=["fractal-value", "total-revs"], ascending=[False, False])
    return frame.reset_index(drop=True)
@@ -0,0 +1,51 @@
1
+ """Entity listing analysis for code-maat-python.
2
+
3
+ Lists all entities with basic statistics.
4
+ """
5
+
6
+ import pandas as pd
7
+
8
+ from code_maat_python.parser import GitLogSchema
9
+
10
+
11
def analyze_entities(df: pd.DataFrame) -> pd.DataFrame:
    """List all entities with the number of commit records each appears in.

    A simple inventory of every file in the log together with its number of
    appearances (raw record count, not unique revisions).

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added, loc_deleted

    Returns:
        DataFrame with columns:
            - entity: File or module path
            - n-commits: Number of commit records (entity appearances)
        Sorted by entity name.
    """
    # Empty input: return the schema only.
    if df.empty:
        return pd.DataFrame(columns=["entity", "n-commits"])

    # Record count per entity, ordered by name.
    counts = df.groupby(GitLogSchema.ENTITY, observed=True).size()
    listing = counts.reset_index(name="n-commits").sort_values("entity")
    return listing.reset_index(drop=True)
@@ -0,0 +1,56 @@
1
+ """Revision frequency analysis for code-maat-python.
2
+
3
+ Analyzes and sorts entities by their revision frequency.
4
+ """
5
+
6
+ import pandas as pd
7
+
8
+ from code_maat_python.parser import GitLogSchema
9
+
10
+
11
def analyze_revisions(df: pd.DataFrame) -> pd.DataFrame:
    """Sort entities by how often they have been revised.

    Counts the unique revisions touching each entity and orders the result
    with the most frequently changed files first -- a quick way to surface
    hotspots in a codebase.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added, loc_deleted

    Returns:
        DataFrame with columns:
            - entity: File or module path
            - n-revs: Number of revisions
        Sorted by n-revs descending, then by entity name.
    """
    # Empty input: return the schema only.
    if df.empty:
        return pd.DataFrame(columns=["entity", "n-revs"])

    # Unique revision count per entity.
    revs = df.groupby(GitLogSchema.ENTITY, observed=True)[GitLogSchema.REV].nunique()
    frame = revs.reset_index()
    frame.columns = ["entity", "n-revs"]

    # Hotspots first; entity name breaks ties deterministically.
    ordered = frame.sort_values(by=["n-revs", "entity"], ascending=[False, True])
    return ordered.reset_index(drop=True)