better-git-of-theseus 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
git_of_theseus/app.py ADDED
@@ -0,0 +1,133 @@
+ import streamlit as st
+ import os
+ import tempfile
+ import shutil
+ try:
+     from git_of_theseus.analyze import analyze
+     from git_of_theseus.plotly_plots import plotly_stack_plot, plotly_line_plot, plotly_survival_plot
+ except ImportError:
+     from analyze import analyze
+     from plotly_plots import plotly_stack_plot, plotly_line_plot, plotly_survival_plot
+
+ st.set_page_config(page_title="Git of Theseus Dash", layout="wide")
+
+ st.title("📊 Git of Theseus - Repository Analysis")
+
+ import sys
+
+ # Sidebar Configuration
+ st.sidebar.header("Configuration")
+
+ default_repo = "."
+ if len(sys.argv) > 1:
+     default_repo = sys.argv[1]
+
+ repo_path = st.sidebar.text_input("Git Repository Path", value=default_repo)
+ branch = st.sidebar.text_input("Branch", value="master")
+
+ with st.sidebar.expander("Analysis Parameters"):
+     cohortfm = st.text_input(
+         "Cohort Format",
+         value="%Y",
+         help="Python strftime format string. Common options:\n\n"
+              "- `%Y`: Year (e.g., 2023)\n"
+              "- `%Y-%m`: Month (e.g., 2023-01)\n"
+              "- `%Y-W%W`: Week (e.g., 2023-W01)\n"
+              "- `%Y-%m-%d`: Day"
+     )
+     interval = st.number_input("Interval (seconds)", value=7 * 24 * 60 * 60)
+     procs = st.number_input("Processes", value=2, min_value=1)
+     ignore = st.text_area("Ignore (comma separated)").split(",")
+     ignore = [i.strip() for i in ignore if i.strip()]
+
+ @st.cache_data(show_spinner=False)
+ def run_analysis(repo_path, branch, cohortfm, interval, procs, ignore):
+     return analyze(
+         repo_path,
+         cohortfm=cohortfm,
+         interval=interval,
+         ignore=ignore,
+         outdir=None,
+         branch=branch,
+         procs=procs,
+         quiet=True
+     )
+
+ # State management for analysis results
+ if 'analysis_results' not in st.session_state:
+     st.session_state.analysis_results = None
+
+ if st.sidebar.button("🚀 Run Analysis") or (len(sys.argv) > 1 and st.session_state.analysis_results is None):
+     with st.spinner("Analyzing repository... this may take a while."):
+         try:
+             st.session_state.analysis_results = run_analysis(
+                 repo_path, branch, cohortfm, interval, procs, ignore
+             )
+             st.success("Analysis completed!")
+         except Exception as e:
+             st.error(f"Analysis failed: {e}")
+             st.session_state.analysis_results = None
+
+ # Main View
+ if st.session_state.analysis_results:
+     results = st.session_state.analysis_results
+     tab1, tab2, tab3 = st.tabs(["Stack Plot", "Line Plot", "Survival Plot"])
+
+     with tab1:
+         st.header("Stack Plot")
+         col1, col2 = st.columns([1, 3])
+         with col1:
+             source_map = {
+                 "Cohorts": "cohorts",
+                 "Authors": "authors",
+                 "Extensions": "exts",
+                 "Directories": "dirs",
+                 "Domains": "domains"
+             }
+             data_source_label = st.selectbox("Data Source", list(source_map.keys()), key="stack_source")
+             data_key = source_map[data_source_label]
+             normalize = st.checkbox("Normalize to 100%", value=False, key="stack_norm")
+             max_n = st.slider("Max Series", 5, 50, 20, key="stack_max_n")
+         with col2:
+             project_name = os.path.basename(os.path.abspath(repo_path))
+             data = results.get(data_key)
+             if data:
+                 fig = plotly_stack_plot(data, normalize=normalize, max_n=max_n, title=project_name)
+                 st.plotly_chart(fig, use_container_width=True)
+             else:
+                 st.warning(f"Data for {data_source_label} not found.")
+
+     with tab2:
+         st.header("Line Plot")
+         col1, col2 = st.columns([1, 3])
+         with col1:
+             data_source_label_line = st.selectbox("Data Source", list(source_map.keys()), key="line_source")
+             data_key_line = source_map[data_source_label_line]
+             normalize_line = st.checkbox("Normalize to 100%", value=False, key="line_norm")
+             max_n_line = st.slider("Max Series", 5, 50, 20, key="line_max_n")
+         with col2:
+             project_name = os.path.basename(os.path.abspath(repo_path))
+             data_line = results.get(data_key_line)
+             if data_line:
+                 fig = plotly_line_plot(data_line, normalize=normalize_line, max_n=max_n_line, title=project_name)
+                 st.plotly_chart(fig, use_container_width=True)
+             else:
+                 st.warning(f"Data for {data_source_label_line} not found.")
+
+     with tab3:
+         st.header("Survival Plot")
+         col1, col2 = st.columns([1, 3])
+         with col1:
+             exp_fit = st.checkbox("Exponential Fit", value=False)
+             years = st.slider("Years", 1, 20, 5)
+         with col2:
+             project_name = os.path.basename(os.path.abspath(repo_path))
+             survival_data = results.get("survival")
+             if survival_data:
+                 fig = plotly_survival_plot(survival_data, exp_fit=exp_fit, years=years, title=project_name)
+                 st.plotly_chart(fig, use_container_width=True)
+             else:
+                 st.warning("Survival data not found.")
+
+ else:
+     st.info("👈 Enter a repository path and click 'Run Analysis' to get started.")
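
The tabs above only assume that analyze() hands back a plain dict: one entry per key in source_map ("cohorts", "authors", "exts", "dirs", "domains"), each shaped as {"ts", "labels", "y"}, plus a "survival" entry mapping commit SHAs to [timestamp, surviving-line-count] pairs. A minimal sketch with made-up values (not real analyze() output) that drives one of the plot helpers directly:

    from git_of_theseus.plotly_plots import plotly_stack_plot

    # Illustrative stand-in for st.session_state.analysis_results; the numbers
    # and the commit SHA are invented, only the shape matters.
    fake_results = {
        "cohorts": {
            "ts": ["2022-01-01T00:00:00", "2023-01-01T00:00:00"],
            "labels": ["2022", "2023"],
            "y": [[100, 80], [0, 50]],
        },
        "survival": {"abc123": [[0, 100], [31536000, 60]]},
    }

    plotly_stack_plot(fake_results["cohorts"], normalize=False, max_n=20, title="demo").show()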
git_of_theseus/cmd.py ADDED
@@ -0,0 +1,23 @@
+ import sys
+ import os
+ import subprocess
+
+ def main():
+     # Get the directory of the current file
+     cmd_dir = os.path.dirname(os.path.abspath(__file__))
+     app_path = os.path.join(cmd_dir, "app.py")
+
+     # The first argument is the repo path, default to current directory
+     repo_path = sys.argv[1] if len(sys.argv) > 1 else os.getcwd()
+     repo_path = os.path.abspath(repo_path)
+
+     # Run streamlit
+     # We pass the repo_path as an argument to the streamlit script
+     subprocess.run([
+         sys.executable, "-m", "streamlit", "run",
+         app_path,
+         "--", repo_path
+     ])
+
+ if __name__ == "__main__":
+     main()
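
main() simply re-executes the current interpreter as "python -m streamlit run .../app.py -- <repo>", so the repository path survives Streamlit's own argument parsing and reappears in app.py's sys.argv. A sketch of an equivalent programmatic launch (the argv values are illustrative placeholders, not a documented interface):

    import sys
    from git_of_theseus import cmd

    # Same effect as running the installed console script with one positional
    # argument; argv[0] is cosmetic and the repo path is a placeholder.
    sys.argv = ["better-git-of-theseus", "/path/to/some/repo"]
    cmd.main()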
git_of_theseus/line_plot.py ADDED
@@ -0,0 +1,102 @@
+ # -*- coding: utf-8 -*-
+ #
+ # Copyright 2016 Erik Bernhardsson
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import matplotlib
+
+ matplotlib.use("Agg")
+
+ import argparse, dateutil.parser, json, numpy, sys
+ from matplotlib import pyplot
+
+
+ from .utils import generate_n_colors
+
+
+ def line_plot(
+     input_fn_or_data, display=False, outfile=None, max_n=20, normalize=False, title=None
+ ):
+     if isinstance(input_fn_or_data, str):
+         data = json.load(open(input_fn_or_data))
+     else:
+         data = input_fn_or_data
+     y = numpy.array(data["y"])
+     y_sums = numpy.sum(y, axis=0)
+     if y.shape[0] > max_n:
+         js = sorted(range(len(data["labels"])), key=lambda j: max(y[j]), reverse=True)
+         top_js = sorted(js[:max_n], key=lambda j: data["labels"][j])
+         y = numpy.array([y[j] for j in top_js])
+         labels = [data["labels"][j] for j in top_js]
+     else:
+         labels = data["labels"]
+     if normalize:
+         y = 100.0 * y / y_sums
+     fig = pyplot.figure(figsize=(16, 12), dpi=120)
+     pyplot.style.use("ggplot")
+     ts = [dateutil.parser.parse(t) for t in data["ts"]]
+     colors = generate_n_colors(len(labels))
+     for color, label, series in zip(colors, labels, y):
+         pyplot.plot(ts, series, color=color, label=label, linewidth=3)
+     pyplot.legend(loc=2)
+     if normalize:
+         pyplot.ylabel("Share of lines of code (%)")
+         pyplot.ylim([0, 100])
+     else:
+         pyplot.ylabel("Lines of code")
+
+     if title:
+         pyplot.text(0.5, 0.5, title, transform=pyplot.gca().transAxes,
+                     fontsize=40, color='gray', alpha=0.3,
+                     ha='center', va='center', rotation=30)
+
+     pyplot.tight_layout()
+     if outfile:
+         print("Writing output to %s" % outfile)
+         pyplot.savefig(outfile)
+
+     if display:
+         pyplot.show()
+
+     return fig
+
+
+ def line_plot_cmdline():
+     parser = argparse.ArgumentParser(description="Plot line plot")
+     parser.add_argument("--display", action="store_true", help="Display plot")
+     parser.add_argument(
+         "--outfile",
+         default="line_plot.png",
+         type=str,
+         help="Output file to store results (default: %(default)s)",
+     )
+     parser.add_argument(
+         "--max-n",
+         default=20,
+         type=int,
+         help="Max number of dataseries (default: %(default)s)",
+     )
+     parser.add_argument(
+         "--normalize",
+         action="store_true",
+         help="Plot the share of each, so it adds up to 100%%",
+     )
+     parser.add_argument("input_fn_or_data", metavar="input_fn")  # dest must match line_plot()'s first parameter
+     kwargs = vars(parser.parse_args())
+
+     line_plot(**kwargs)
+
+
+ if __name__ == "__main__":
+     line_plot_cmdline()
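
line_plot() accepts either a path to a JSON file or an already-parsed dict with "ts", "labels" and "y" keys, and returns the matplotlib figure; note that the module forces the Agg backend, so display=True will not open a window in most environments. A hedged usage sketch (the cohorts.json filename and the git_of_theseus.line_plot module path are assumptions, following upstream git-of-theseus conventions):

    from git_of_theseus.line_plot import line_plot

    # Assumes a cohorts.json written by the analyze step exists in the
    # current directory; line_plot.png is the rendered output.
    fig = line_plot("cohorts.json", outfile="line_plot.png", normalize=True, max_n=10)
    fig.savefig("line_plot_small.png", dpi=72)  # the returned figure can be reused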
git_of_theseus/plotly_plots.py ADDED
@@ -0,0 +1,243 @@
+ import plotly.graph_objects as go
+ import plotly.express as px
+ import numpy as np
+ import dateutil.parser
+ import collections
+ import math
+ import os
+ from .utils import generate_n_colors
+
+ def _process_stack_line_data(data, max_n=20, normalize=False):
+     # Accept either an already-parsed dict (what app.py passes) or a path to a
+     # JSON file produced by the analyze step; both carry the same
+     # {"ts": [...], "labels": [...], "y": [...]} structure.
+     if not isinstance(data, dict):
+         # Fall back to treating the argument as a filename
+         import json
+         data = json.load(open(data))
+
+     y = np.array(data["y"])
+     labels = data["labels"]
+     ts = [dateutil.parser.parse(t) for t in data["ts"]]
+
+     # Sort and keep only the top N series
+     if y.shape[0] > max_n:
+         # Sort series by their peak value
+         js = sorted(range(len(labels)), key=lambda j: max(y[j]), reverse=True)
+
+         # Roll everything beyond the top max_n series into an "other" bucket
+         other_indices = js[max_n:]
+         if other_indices:
+             other_sum = np.sum([y[j] for j in other_indices], axis=0)
+
+             # Top N indices, ordered by label
+             top_js = sorted(js[:max_n], key=lambda j: labels[j])
+
+             y = np.array([y[j] for j in top_js] + [other_sum])
+             labels = [labels[j] for j in top_js] + ["other"]
+         else:
+             # Unreachable: js[max_n:] is non-empty whenever y.shape[0] > max_n
+             pass
+     else:
+         # Fewer series than max_n: keep every series.
+         # The matplotlib plots leave the original label order untouched in
+         # this case, so no re-sorting is applied here either; y and labels
+         # are used exactly as loaded.
+         pass
+
+     y_sums = np.sum(y, axis=0)
+
+     # Avoid division by zero
+     y_sums[y_sums == 0] = 1.0
+
+     if normalize:
+         y = 100.0 * y / y_sums
+
+     return ts, y, labels
+
+ def plotly_stack_plot(data, max_n=20, normalize=False, title=None):
+     ts, y, labels = _process_stack_line_data(data, max_n, normalize)
+
+     fig = go.Figure()
+
+     # Use a qualitative color palette
+     colors = px.colors.qualitative.Plotly
+     if len(labels) > len(colors):
+         colors = px.colors.qualitative.Dark24  # larger palette if needed
+
+     for i, label in enumerate(labels):
+         color = colors[i % len(colors)]
+         fig.add_trace(go.Scatter(
+             x=ts,
+             y=y[i],
+             mode='lines',
+             name=label,
+             stackgroup='one',  # this enables stacking
+             line=dict(width=0.5, color=color),
+             fillcolor=color  # match the fill to the line color
+         ))
+
+     fig.update_layout(
+         title=dict(text=title, x=0.5) if title else None,
+         yaxis=dict(
+             title="Share of lines of code (%)" if normalize else "Lines of code",
+             range=[0, 100] if normalize else None
+         ),
+         xaxis=dict(title="Date"),
+         hovermode="x unified",
+         margin=dict(l=20, r=20, t=50, b=20),
+     )
+
+
+     return fig
+
+ def plotly_line_plot(data, max_n=20, normalize=False, title=None):
+     ts, y, labels = _process_stack_line_data(data, max_n, normalize)
+
+     fig = go.Figure()
+
+     for i, label in enumerate(labels):
+         fig.add_trace(go.Scatter(
+             x=ts,
+             y=y[i],
+             mode='lines',
+             name=label,
+             line=dict(width=2)
+         ))
+
+     fig.update_layout(
+         title=dict(text=title, x=0.5) if title else None,
+         yaxis=dict(
+             title="Share of lines of code (%)" if normalize else "Lines of code",
+             range=[0, 100] if normalize else None
+         ),
+         xaxis=dict(title="Date"),
+         hovermode="x unified",
+         margin=dict(l=20, r=20, t=50, b=20),
+     )
+
+
+     return fig
+
+ def plotly_survival_plot(commit_history, exp_fit=False, years=5, title=None):
+     # Logic adapted from survival_plot.py
+     # commit_history is {sha: [[ts, count], ...]}
+
+     deltas = collections.defaultdict(lambda: np.zeros(2))
+     total_n = 0
+     YEAR = 365.25 * 24 * 60 * 60
+
+     # Build per-commit deltas.
+     # The matplotlib survival_plot accepted a list of JSON filenames and
+     # merged their histories; here we operate on the single parsed dict that
+     # app.py passes in (the contents of one survival file).
+
+     # Each history is a list of [timestamp, surviving_line_count] pairs,
+     # starting with the commit itself; its final entry is used below to
+     # censor the commit's remaining lines at its last observed timestamp.
+
+     for commit, history in commit_history.items():
+         t0, orig_count = history[0]
+         total_n += orig_count
+         last_count = orig_count
+         for t, count in history[1:]:
+             deltas[t - t0] += (count - last_count, 0)
+             last_count = count
+         deltas[history[-1][0] - t0] += (-last_count, -orig_count)
+
+     # Calculate the survival curve
+     P = 1.0
+     xs = []
+     ys = []
+
+     # Sort deltas by elapsed time
+     sorted_times = sorted(deltas.keys())
+
+     # total_n is the current risk-set size; it shrinks as commits censor out
+
+     for t in sorted_times:
+         delta_k, delta_n = deltas[t]
+         xs.append(t / YEAR)
+         ys.append(100.0 * P)
+
+         if total_n > 0:
+             P *= 1 + delta_k / total_n
+
+         # Shrink the risk set by the lines censored at this time step
+         total_n += delta_n
+
+         if P < 0.05:
+             break
+
+     fig = go.Figure()
+
+     # Main survival curve
+     fig.add_trace(go.Scatter(
+         x=xs, y=ys,
+         mode='lines',
+         name='Survival Rate',
+         line=dict(color='blue')
+     ))
+
+     # Exponential fit
+     if exp_fit:
+         try:
+             import scipy.optimize
+
+             # Define the loss function for the fit
+             def fit(k):
+                 loss = 0.0
+                 # Re-walk the delta stream and compare the observed survival
+                 # fraction against the exponential model exp(-k * t / YEAR),
+                 # weighting each step by the current risk-set size.
+
+                 # Recompute the initial line count (total_n above was mutated):
+                 curr_total_n = 0
+                 for _, history in commit_history.items():
+                     curr_total_n += history[0][1]
+
+                 P_fit = 1.0
+                 curr_total_n_fit = curr_total_n
+
+                 for t in sorted_times:
+                     delta_k, delta_n = deltas[t]
+                     pred = curr_total_n_fit * math.exp(-k * t / YEAR)
+                     loss += (curr_total_n_fit * P_fit - pred) ** 2
+                     if curr_total_n_fit > 0:
+                         P_fit *= 1 + delta_k / curr_total_n_fit
+                     curr_total_n_fit += delta_n
+                 return loss
+
+             k_opt = scipy.optimize.fmin(fit, 0.5, maxiter=50, disp=False)[0]
+
+             ts_fit = np.linspace(0, years, 100)
+             ys_fit = [100.0 * math.exp(-k_opt * t) for t in ts_fit]
+
+             half_life = math.log(2) / k_opt
+
+             fig.add_trace(go.Scatter(
+                 x=ts_fit, y=ys_fit,
+                 mode='lines',
+                 name=f"Exp. Fit (Half-life: {half_life:.2f} yrs)",
+                 line=dict(color='red', dash='dash')
+             ))
+
+         except ImportError:
+             pass  # scipy is optional; skip the fit if it is unavailable
+
+     fig.update_layout(
+         title=dict(text=title, x=0.5) if title else None,
+         yaxis=dict(
+             title="lines still present (%)",
+             range=[0, 100]
+         ),
+         xaxis=dict(
+             title="Years",
+             range=[0, years]
+         ),
+         hovermode="x unified",
+         margin=dict(l=20, r=20, t=50, b=20),
+     )
+
+
+     return fig
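
plotly_survival_plot() builds a discrete survival curve: for each commit it accumulates signed changes in surviving line count keyed by elapsed time, then walks those deltas in order, multiplying the survival fraction P by (1 + delta_k / total_n) at each step while the risk set shrinks as commits drop out. A toy input (timestamps in seconds, all values invented) that exercises it without running an analysis:

    from git_of_theseus.plotly_plots import plotly_survival_plot

    # One commit observed at three weekly snapshots; "deadbeef" and the
    # counts are made up, only the {sha: [[ts, count], ...]} shape matters.
    week = 7 * 24 * 60 * 60
    history = {"deadbeef": [[0, 1000], [week, 900], [2 * week, 800]]}

    plotly_survival_plot(history, exp_fit=False, years=1, title="toy").show()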
git_of_theseus/stack_plot.py ADDED
@@ -0,0 +1,98 @@
+ # -*- coding: utf-8 -*-
+ #
+ # Copyright 2016 Erik Bernhardsson
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import matplotlib
+
+ matplotlib.use("Agg")
+
+ import argparse, dateutil.parser, json, numpy, sys
+ from matplotlib import pyplot
+
+ from .utils import generate_n_colors
+
+
+ def stack_plot(
+     input_fn_or_data, display=False, outfile=None, max_n=20, normalize=False, title=None
+ ):
+     if isinstance(input_fn_or_data, str):
+         data = json.load(open(input_fn_or_data))
+     else:
+         data = input_fn_or_data
+     y = numpy.array(data["y"])
+     if y.shape[0] > max_n:
+         js = sorted(range(len(data["labels"])), key=lambda j: max(y[j]), reverse=True)
+         other_sum = numpy.sum([y[j] for j in js[max_n:]], axis=0)
+         top_js = sorted(js[:max_n], key=lambda j: data["labels"][j])
+         y = numpy.array([y[j] for j in top_js] + [other_sum])
+         labels = [data["labels"][j] for j in top_js] + ["other"]
+     else:
+         labels = data["labels"]
+     if normalize:
+         y = 100.0 * numpy.array(y) / numpy.sum(y, axis=0)
+     fig = pyplot.figure(figsize=(16, 12), dpi=120)
+     pyplot.style.use("ggplot")
+     ts = [dateutil.parser.parse(t) for t in data["ts"]]
+     colors = generate_n_colors(len(labels))
+     pyplot.stackplot(ts, numpy.array(y), labels=labels, colors=colors)
+     pyplot.legend(loc=2)
+     if normalize:
+         pyplot.ylabel("Share of lines of code (%)")
+         pyplot.ylim([0, 100])
+     else:
+         pyplot.ylabel("Lines of code")
+
+     if title:
+         pyplot.text(0.5, 0.5, title, transform=pyplot.gca().transAxes,
+                     fontsize=40, color='gray', alpha=0.3,
+                     ha='center', va='center', rotation=30)
+
+     pyplot.tight_layout()
+     if outfile:
+         print("Writing output to %s" % outfile)
+         pyplot.savefig(outfile)
+
+     if display:
+         pyplot.show()
+
+     return fig
+
+
+ def stack_plot_cmdline():
+     parser = argparse.ArgumentParser(description="Plot stack plot")
+     parser.add_argument("--display", action="store_true", help="Display plot")
+     parser.add_argument(
+         "--outfile",
+         default="stack_plot.png",
+         type=str,
+         help="Output file to store results (default: %(default)s)",
+     )
+     parser.add_argument(
+         "--max-n",
+         default=20,
+         type=int,
+         help='Max number of dataseries (will roll everything else into "other") (default: %(default)s)',
+     )
+     parser.add_argument(
+         "--normalize", action="store_true", help="Normalize the plot to 100%%"
+     )
+     parser.add_argument("input_fn_or_data", metavar="input_fn")  # dest must match stack_plot()'s first parameter
+     kwargs = vars(parser.parse_args())
+
+     stack_plot(**kwargs)
+
+
+ if __name__ == "__main__":
+     stack_plot_cmdline()
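
stack_plot_cmdline() forwards the parsed argparse namespace straight into stack_plot() via **kwargs, so the option names map one-to-one onto the function's keyword parameters. A sketch that drives the CLI entry point programmatically (the cohorts.json input is an assumed output of the analyze step, and the git_of_theseus.stack_plot module path matches the filename assumed above):

    import sys
    from git_of_theseus.stack_plot import stack_plot_cmdline

    # Equivalent to: stack_plot --max-n 10 --normalize cohorts.json
    sys.argv = ["stack_plot", "--max-n", "10", "--normalize", "cohorts.json"]
    stack_plot_cmdline()  # writes stack_plot.png by default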