code-maat-python 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,90 @@
1
+ """Sum of Coupling (SOC) analysis for code-maat-python.
2
+
3
+ Calculate sum of coupling for each entity based on commit size.
4
+ """
5
+
6
+ import pandas as pd
7
+
8
+ from code_maat_python.parser import GitLogSchema
9
+
10
+
11
def analyze_soc(df: pd.DataFrame, max_changeset_size: int = 30) -> pd.DataFrame:
    """Compute the Sum of Coupling (SOC) score for every entity.

    SOC is a lightweight alternative to full logical-coupling analysis:
    for a commit touching m files, each of those files earns (m - 1)
    points, and an entity's SOC is the sum of its points over all
    commits. It answers "how often does this file change together with
    other files?" without recording which files those are.

    Commits larger than ``max_changeset_size`` are discarded up front,
    since huge changesets (bulk renames, reformatting sweeps) would
    otherwise dominate the scores.

    Args:
        df: DataFrame with columns: entity, rev, author, date,
            loc_added, loc_deleted.
        max_changeset_size: Largest commit (in file count) still
            considered for coupling (default: 30).

    Returns:
        DataFrame with columns:
            - entity: File or module path
            - soc: Sum of coupling score (integer)
        Sorted by soc descending, then by entity name.

    Example:
        >>> data = [
        ...     {'entity': 'src/main.py', 'rev': '1', 'author': 'Alice', 'date': '2023-01-01'},
        ...     {'entity': 'src/utils.py', 'rev': '1', 'author': 'Alice', 'date': '2023-01-01'},
        ...     {'entity': 'src/test.py', 'rev': '2', 'author': 'Bob', 'date': '2023-01-02'},
        ... ]
        >>> df = pd.DataFrame(data)
        >>> result = analyze_soc(df)
        >>> print(result)
                 entity  soc
        0   src/main.py    1
        1  src/utils.py    1
        2   src/test.py    0
        # main.py and utils.py shared a commit (m=2): soc=1 each.
        # test.py was committed alone (m=1): soc=0.
    """
    empty_result = pd.DataFrame(columns=["entity", "soc"])
    if df.empty:
        return empty_result

    # Drop oversized commits before any per-entity accounting.
    files_per_rev = df.groupby(GitLogSchema.REV, observed=True)[GitLogSchema.ENTITY].count()
    revs_to_keep = files_per_rev[files_per_rev <= max_changeset_size].index
    kept = df[df[GitLogSchema.REV].isin(revs_to_keep)].copy()

    if kept.empty:
        return empty_result

    # Broadcast each commit's size back onto its rows, then turn it
    # into the per-row contribution (commit_size - 1).
    kept["commit_size"] = (
        kept.groupby(GitLogSchema.REV, observed=True)[GitLogSchema.ENTITY]
        .transform("count")
        .astype(int)
    )
    kept["soc_contribution"] = kept["commit_size"] - 1

    # Total the contributions per entity.
    totals = (
        kept.groupby(GitLogSchema.ENTITY, observed=True)["soc_contribution"]
        .sum()
        .reset_index()
        .rename(columns={"soc_contribution": "soc"})
    )

    # Highest coupling first; ties broken alphabetically by entity.
    totals["soc"] = totals["soc"].astype(int)
    ordered = totals.sort_values(["soc", "entity"], ascending=[False, True])
    return ordered.reset_index(drop=True)
@@ -0,0 +1,61 @@
1
+ """Summary statistics analysis for code-maat-python.
2
+
3
+ Provides overview statistics for a repository.
4
+ """
5
+
6
+ import pandas as pd
7
+
8
+ from code_maat_python.parser import GitLogSchema
9
+
10
+
11
def analyze_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Generate overview statistics.

    Produces a small two-column table of repository-level facts:
    distinct commit, entity, and author counts plus the first and
    last commit dates.

    Args:
        df: DataFrame with columns: entity, rev, author, date, loc_added, loc_deleted

    Returns:
        DataFrame with columns:
            - statistic: Name of the statistic
            - value: String representation of the value
        Rows (in order): n-commits, n-entities, n-authors,
        first-commit, last-commit.

    Example:
        >>> data = [
        ...     {'entity': 'src/main.py', 'rev': 'abc', 'author': 'Dev1',
        ...      'date': pd.to_datetime('2023-01-01')},
        ...     {'entity': 'src/utils.py', 'rev': 'def', 'author': 'Dev2',
        ...      'date': pd.to_datetime('2023-01-15')},
        ... ]
        >>> df = pd.DataFrame(data)
        >>> result = analyze_summary(df)
        >>> print(result)
              statistic       value
        0     n-commits           2
        1    n-entities           2
        2     n-authors           2
        3  first-commit  2023-01-01
        4   last-commit  2023-01-15
    """
    # Nothing to summarize for an empty log.
    if df.empty:
        return pd.DataFrame(columns=["statistic", "value"])

    # NOTE(review): min()/max() assume the date column already holds
    # datetime values (strftime would fail on plain strings).
    dates = df[GitLogSchema.DATE]

    # Ordered (name, value) pairs — order defines the output rows.
    rows = [
        ("n-commits", df[GitLogSchema.REV].nunique()),
        ("n-entities", df[GitLogSchema.ENTITY].nunique()),
        ("n-authors", df[GitLogSchema.AUTHOR].nunique()),
        ("first-commit", dates.min().strftime("%Y-%m-%d")),
        ("last-commit", dates.max().strftime("%Y-%m-%d")),
    ]

    return pd.DataFrame(
        [{"statistic": name, "value": str(value)} for name, value in rows]
    )