code-maat-python 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_maat_python/__init__.py +12 -0
- code_maat_python/__main__.py +5 -0
- code_maat_python/analyses/__init__.py +39 -0
- code_maat_python/analyses/age.py +101 -0
- code_maat_python/analyses/authors.py +60 -0
- code_maat_python/analyses/churn.py +353 -0
- code_maat_python/analyses/communication.py +151 -0
- code_maat_python/analyses/coupling.py +136 -0
- code_maat_python/analyses/effort.py +210 -0
- code_maat_python/analyses/entities.py +51 -0
- code_maat_python/analyses/revisions.py +56 -0
- code_maat_python/analyses/soc.py +90 -0
- code_maat_python/analyses/summary.py +61 -0
- code_maat_python/cli.py +822 -0
- code_maat_python/output/__init__.py +0 -0
- code_maat_python/parser.py +232 -0
- code_maat_python/pipeline.py +112 -0
- code_maat_python/transformers/__init__.py +0 -0
- code_maat_python/transformers/grouper.py +204 -0
- code_maat_python/transformers/team_mapper.py +132 -0
- code_maat_python/transformers/time_grouper.py +146 -0
- code_maat_python/utils/__init__.py +0 -0
- code_maat_python/utils/math.py +105 -0
- code_maat_python-0.1.0.dist-info/METADATA +545 -0
- code_maat_python-0.1.0.dist-info/RECORD +28 -0
- code_maat_python-0.1.0.dist-info/WHEEL +4 -0
- code_maat_python-0.1.0.dist-info/entry_points.txt +3 -0
- code_maat_python-0.1.0.dist-info/licenses/LICENSE +674 -0
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""Sum of Coupling (SOC) analysis for code-maat-python.
|
|
2
|
+
|
|
3
|
+
Calculate sum of coupling for each entity based on commit size.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from code_maat_python.parser import GitLogSchema
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def analyze_soc(df: pd.DataFrame, max_changeset_size: int = 30) -> pd.DataFrame:
    """Calculate sum of coupling (SOC) for each entity.

    Sum of Coupling provides a simpler metric than full logical coupling
    analysis. For each commit with m files, each file gets a SOC score
    of (m - 1). This measures how often a file changes with other files,
    without tracking which specific files it changes with.

    Algorithm:
        1. Count the number of files in each commit (revision).
        2. Drop commits with more than max_changeset_size files.
        3. For each entity occurrence in a remaining commit, add
           (commit_size - 1) to that entity's SOC score.
        4. Sum all contributions per entity.

    This is simpler and faster than full coupling analysis, making it
    useful for validating the data pipeline.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added, loc_deleted
        max_changeset_size: Maximum number of files in a commit to consider (default: 30)

    Returns:
        DataFrame with columns:
        - entity: File or module path
        - soc: Sum of coupling score (integer)
        Sorted by soc descending, then by entity name ascending.

    Example:
        >>> data = [
        ...     {'entity': 'src/main.py', 'rev': '1', 'author': 'Alice', 'date': '2023-01-01'},
        ...     {'entity': 'src/utils.py', 'rev': '1', 'author': 'Alice', 'date': '2023-01-01'},
        ...     {'entity': 'src/test.py', 'rev': '2', 'author': 'Bob', 'date': '2023-01-02'},
        ... ]
        >>> df = pd.DataFrame(data)
        >>> result = analyze_soc(df)
        >>> print(result)
                 entity  soc
        0   src/main.py    1
        1  src/utils.py    1
        2   src/test.py    0
        # main.py and utils.py were in a commit together (m=2), so each gets soc=1
        # test.py was alone (m=1), so gets soc=0
    """
    empty_result = pd.DataFrame(columns=["entity", "soc"])

    # Handle empty DataFrame
    if df.empty:
        return empty_result

    # Count files per commit ONCE; reuse the same Series both for the
    # size filter and for the per-row commit-size lookup below.
    # (Previously this groupby was computed a second time on the
    # filtered frame, doing redundant work for identical results.)
    commit_sizes = df.groupby(GitLogSchema.REV, observed=True)[GitLogSchema.ENTITY].count()
    valid_revs = commit_sizes[commit_sizes <= max_changeset_size].index
    df_filtered = df[df[GitLogSchema.REV].isin(valid_revs)].copy()

    if df_filtered.empty:
        return empty_result

    # Every surviving row's rev is present in commit_sizes, so mapping the
    # unfiltered counts is safe. Each occurrence contributes (size - 1).
    df_filtered["commit_size"] = (
        df_filtered[GitLogSchema.REV].map(commit_sizes).astype(int)
    )
    df_filtered["soc_contribution"] = df_filtered["commit_size"] - 1

    # Sum contributions per entity
    result = (
        df_filtered.groupby(GitLogSchema.ENTITY, observed=True)["soc_contribution"]
        .sum()
        .reset_index()
        .rename(columns={"soc_contribution": "soc"})
    )

    # Convert to integer and sort: highest coupling first, ties by name
    result["soc"] = result["soc"].astype(int)
    result = result.sort_values(["soc", "entity"], ascending=[False, True])

    return result.reset_index(drop=True)
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Summary statistics analysis for code-maat-python.
|
|
2
|
+
|
|
3
|
+
Provides overview statistics for a repository.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from code_maat_python.parser import GitLogSchema
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def analyze_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Generate overview statistics.

    Provides high-level statistics about the repository including
    commit count, entity count, author count, and date range.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added, loc_deleted

    Returns:
        DataFrame with columns:
        - statistic: Name of the statistic
        - value: String representation of the value
        Including: n-commits, n-entities, n-authors, first-commit, last-commit

    Example:
        >>> data = [
        ...     {'entity': 'src/main.py', 'rev': 'abc', 'author': 'Dev1',
        ...      'date': pd.to_datetime('2023-01-01')},
        ...     {'entity': 'src/utils.py', 'rev': 'def', 'author': 'Dev2',
        ...      'date': pd.to_datetime('2023-01-15')},
        ... ]
        >>> df = pd.DataFrame(data)
        >>> result = analyze_summary(df)
        >>> print(result)
              statistic       value
        0     n-commits           2
        1    n-entities           2
        2     n-authors           2
        3  first-commit  2023-01-01
        4   last-commit  2023-01-15
    """
    # Nothing to summarize for an empty log
    if df.empty:
        return pd.DataFrame(columns=["statistic", "value"])

    # Ordered (name, value) pairs; the output preserves this order.
    # Dates are formatted as ISO day strings to match the other values'
    # string representation in the 'value' column.
    pairs = [
        ("n-commits", df[GitLogSchema.REV].nunique()),
        ("n-entities", df[GitLogSchema.ENTITY].nunique()),
        ("n-authors", df[GitLogSchema.AUTHOR].nunique()),
        ("first-commit", df[GitLogSchema.DATE].min().strftime("%Y-%m-%d")),
        ("last-commit", df[GitLogSchema.DATE].max().strftime("%Y-%m-%d")),
    ]

    return pd.DataFrame(
        [(name, str(value)) for name, value in pairs],
        columns=["statistic", "value"],
    )
|