better-git-of-theseus 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
git_of_theseus/app.py ADDED
@@ -0,0 +1,133 @@
+ import streamlit as st
+ import os
+ import tempfile
+ import shutil
+ try:
+     from git_of_theseus.analyze import analyze
+     from git_of_theseus.plotly_plots import plotly_stack_plot, plotly_line_plot, plotly_survival_plot
+ except ImportError:
+     from analyze import analyze
+     from plotly_plots import plotly_stack_plot, plotly_line_plot, plotly_survival_plot
+
+ st.set_page_config(page_title="Git of Theseus Dash", layout="wide")
+
+ st.title("📊 Git of Theseus - Repository Analysis")
+
+ import sys
+
+ # Sidebar Configuration
+ st.sidebar.header("Configuration")
+
+ default_repo = "."
+ if len(sys.argv) > 1:
+     default_repo = sys.argv[1]
+
+ repo_path = st.sidebar.text_input("Git Repository Path", value=default_repo)
+ branch = st.sidebar.text_input("Branch", value="master")
+
+ with st.sidebar.expander("Analysis Parameters"):
+     cohortfm = st.text_input(
+         "Cohort Format",
+         value="%Y",
+         help="Python strftime format string. Common options:\n\n"
+              "- `%Y`: Year (e.g., 2023)\n"
+              "- `%Y-%m`: Month (e.g., 2023-01)\n"
+              "- `%Y-W%W`: Week (e.g., 2023-W01)\n"
+              "- `%Y-%m-%d`: Day"
+     )
+     interval = st.number_input("Interval (seconds)", value=7 * 24 * 60 * 60)
+     procs = st.number_input("Processes", value=2, min_value=1)
+     ignore = st.text_area("Ignore (comma separated)").split(",")
+     ignore = [i.strip() for i in ignore if i.strip()]
+
+ @st.cache_data(show_spinner=False)
+ def run_analysis(repo_path, branch, cohortfm, interval, procs, ignore):
+     return analyze(
+         repo_path,
+         cohortfm=cohortfm,
+         interval=interval,
+         ignore=ignore,
+         outdir=None,
+         branch=branch,
+         procs=procs,
+         quiet=True
+     )
+
+ # State management for analysis results
+ if 'analysis_results' not in st.session_state:
+     st.session_state.analysis_results = None
+
+ if st.sidebar.button("🚀 Run Analysis") or (len(sys.argv) > 1 and st.session_state.analysis_results is None):
+     with st.spinner("Analyzing repository... this may take a while."):
+         try:
+             st.session_state.analysis_results = run_analysis(
+                 repo_path, branch, cohortfm, interval, procs, ignore
+             )
+             st.success("Analysis completed!")
+         except Exception as e:
+             st.error(f"Analysis failed: {e}")
+             st.session_state.analysis_results = None
+
+ # Main View
+ if st.session_state.analysis_results:
+     results = st.session_state.analysis_results
+     tab1, tab2, tab3 = st.tabs(["Stack Plot", "Line Plot", "Survival Plot"])
+
+     with tab1:
+         st.header("Stack Plot")
+         col1, col2 = st.columns([1, 3])
+         with col1:
+             source_map = {
+                 "Cohorts": "cohorts",
+                 "Authors": "authors",
+                 "Extensions": "exts",
+                 "Directories": "dirs",
+                 "Domains": "domains"
+             }
+             data_source_label = st.selectbox("Data Source", list(source_map.keys()), key="stack_source")
+             data_key = source_map[data_source_label]
+             normalize = st.checkbox("Normalize to 100%", value=False, key="stack_norm")
+             max_n = st.slider("Max Series", 5, 50, 20, key="stack_max_n")
+         with col2:
+             project_name = os.path.basename(os.path.abspath(repo_path))
+             data = results.get(data_key)
+             if data:
+                 fig = plotly_stack_plot(data, normalize=normalize, max_n=max_n, title=project_name)
+                 st.plotly_chart(fig, use_container_width=True)
+             else:
+                 st.warning(f"Data for {data_source_label} not found.")
+
+     with tab2:
+         st.header("Line Plot")
+         col1, col2 = st.columns([1, 3])
+         with col1:
+             data_source_label_line = st.selectbox("Data Source", list(source_map.keys()), key="line_source")
+             data_key_line = source_map[data_source_label_line]
+             normalize_line = st.checkbox("Normalize to 100%", value=False, key="line_norm")
+             max_n_line = st.slider("Max Series", 5, 50, 20, key="line_max_n")
+         with col2:
+             project_name = os.path.basename(os.path.abspath(repo_path))
+             data_line = results.get(data_key_line)
+             if data_line:
+                 fig = plotly_line_plot(data_line, normalize=normalize_line, max_n=max_n_line, title=project_name)
+                 st.plotly_chart(fig, use_container_width=True)
+             else:
+                 st.warning(f"Data for {data_source_label_line} not found.")
+
+     with tab3:
+         st.header("Survival Plot")
+         col1, col2 = st.columns([1, 3])
+         with col1:
+             exp_fit = st.checkbox("Exponential Fit", value=False)
+             years = st.slider("Years", 1, 20, 5)
+         with col2:
+             project_name = os.path.basename(os.path.abspath(repo_path))
+             survival_data = results.get("survival")
+             if survival_data:
+                 fig = plotly_survival_plot(survival_data, exp_fit=exp_fit, years=years, title=project_name)
+                 st.plotly_chart(fig, use_container_width=True)
+             else:
+                 st.warning("Survival data not found.")
+
+ else:
+     st.info("👈 Enter a repository path and click 'Run Analysis' to get started.")
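
The tabs above only assume that analyze() hands back a plain dict: one entry per key in source_map ("cohorts", "authors", "exts", "dirs", "domains"), each shaped as {"ts", "labels", "y"}, plus a "survival" entry mapping commit SHAs to [timestamp, surviving-line-count] pairs. A minimal sketch with made-up values (not real analyze() output) that drives one of the plot helpers directly:

    from git_of_theseus.plotly_plots import plotly_stack_plot

    # Illustrative stand-in for st.session_state.analysis_results; the numbers
    # and the commit SHA are invented, only the shape matters.
    fake_results = {
        "cohorts": {
            "ts": ["2022-01-01T00:00:00", "2023-01-01T00:00:00"],
            "labels": ["2022", "2023"],
            "y": [[100, 80], [0, 50]],
        },
        "survival": {"abc123": [[0, 100], [31536000, 60]]},
    }

    plotly_stack_plot(fake_results["cohorts"], normalize=False, max_n=20, title="demo").show()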
git_of_theseus/cmd.py ADDED
@@ -0,0 +1,23 @@
+ import sys
+ import os
+ import subprocess
+
+ def main():
+     # Get the directory of the current file
+     cmd_dir = os.path.dirname(os.path.abspath(__file__))
+     app_path = os.path.join(cmd_dir, "app.py")
+
+     # The first argument is the repo path, default to current directory
+     repo_path = sys.argv[1] if len(sys.argv) > 1 else os.getcwd()
+     repo_path = os.path.abspath(repo_path)
+
+     # Run streamlit
+     # We pass the repo_path as an argument to the streamlit script
+     subprocess.run([
+         sys.executable, "-m", "streamlit", "run",
+         app_path,
+         "--", repo_path
+     ])
+
+ if __name__ == "__main__":
+     main()
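
main() simply re-executes the current interpreter as "python -m streamlit run .../app.py -- <repo>", so the repository path survives Streamlit's own argument parsing and reappears in app.py's sys.argv. A sketch of an equivalent programmatic launch (the argv values are illustrative placeholders, not a documented interface):

    import sys
    from git_of_theseus import cmd

    # Same effect as running the installed console script with one positional
    # argument; argv[0] is cosmetic and the repo path is a placeholder.
    sys.argv = ["better-git-of-theseus", "/path/to/some/repo"]
    cmd.main()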
git_of_theseus/line_plot.py ADDED
@@ -0,0 +1,102 @@
+ # -*- coding: utf-8 -*-
+ #
+ # Copyright 2016 Erik Bernhardsson
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import matplotlib
+
+ matplotlib.use("Agg")
+
+ import argparse, dateutil.parser, json, numpy, sys
+ from matplotlib import pyplot
+
+
+ from .utils import generate_n_colors
+
+
+ def line_plot(
+     input_fn_or_data, display=False, outfile=None, max_n=20, normalize=False, title=None
+ ):
+     if isinstance(input_fn_or_data, str):
+         data = json.load(open(input_fn_or_data))
+     else:
+         data = input_fn_or_data
+     y = numpy.array(data["y"])
+     y_sums = numpy.sum(y, axis=0)
+     if y.shape[0] > max_n:
+         js = sorted(range(len(data["labels"])), key=lambda j: max(y[j]), reverse=True)
+         top_js = sorted(js[:max_n], key=lambda j: data["labels"][j])
+         y = numpy.array([y[j] for j in top_js])
+         labels = [data["labels"][j] for j in top_js]
+     else:
+         labels = data["labels"]
+     if normalize:
+         y = 100.0 * y / y_sums
+     fig = pyplot.figure(figsize=(16, 12), dpi=120)
+     pyplot.style.use("ggplot")
+     ts = [dateutil.parser.parse(t) for t in data["ts"]]
+     colors = generate_n_colors(len(labels))
+     for color, label, series in zip(colors, labels, y):
+         pyplot.plot(ts, series, color=color, label=label, linewidth=3)
+     pyplot.legend(loc=2)
+     if normalize:
+         pyplot.ylabel("Share of lines of code (%)")
+         pyplot.ylim([0, 100])
+     else:
+         pyplot.ylabel("Lines of code")
+
+     if title:
+         pyplot.text(0.5, 0.5, title, transform=pyplot.gca().transAxes,
+                     fontsize=40, color='gray', alpha=0.3,
+                     ha='center', va='center', rotation=30)
+
+     pyplot.tight_layout()
+     if outfile:
+         print("Writing output to %s" % outfile)
+         pyplot.savefig(outfile)
+
+     if display:
+         pyplot.show()
+
+     return fig
+
+
+ def line_plot_cmdline():
+     parser = argparse.ArgumentParser(description="Plot line plot")
+     parser.add_argument("--display", action="store_true", help="Display plot")
+     parser.add_argument(
+         "--outfile",
+         default="line_plot.png",
+         type=str,
+         help="Output file to store results (default: %(default)s)",
+     )
+     parser.add_argument(
+         "--max-n",
+         default=20,
+         type=int,
+         help="Max number of dataseries (default: %(default)s)",
+     )
+     parser.add_argument(
+         "--normalize",
+         action="store_true",
+         help="Plot the share of each, so it adds up to 100%%",
+     )
+     parser.add_argument("input_fn_or_data", metavar="input_fn")  # dest must match line_plot()'s first parameter
+     kwargs = vars(parser.parse_args())
+
+     line_plot(**kwargs)
+
+
+ if __name__ == "__main__":
+     line_plot_cmdline()
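
line_plot() accepts either a path to a JSON file or an already-parsed dict with "ts", "labels" and "y" keys, and returns the matplotlib figure; note that the module forces the Agg backend, so display=True will not open a window in most environments. A hedged usage sketch (the cohorts.json filename and the git_of_theseus.line_plot module path are assumptions, following upstream git-of-theseus conventions):

    from git_of_theseus.line_plot import line_plot

    # Assumes a cohorts.json written by the analyze step exists in the
    # current directory; line_plot.png is the rendered output.
    fig = line_plot("cohorts.json", outfile="line_plot.png", normalize=True, max_n=10)
    fig.savefig("line_plot_small.png", dpi=72)  # the returned figure can be reused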
git_of_theseus/plotly_plots.py ADDED
@@ -0,0 +1,243 @@
+ import plotly.graph_objects as go
+ import plotly.express as px
+ import numpy as np
+ import dateutil.parser
+ import collections
+ import math
+ import os
+ from .utils import generate_n_colors
+
+ def _process_stack_line_data(data, max_n=20, normalize=False):
+     # Accept either an already-parsed dict (what app.py passes) or a path to a
+     # JSON file produced by the analyze step; both carry the same
+     # {"ts": [...], "labels": [...], "y": [...]} structure.
+     if not isinstance(data, dict):
+         # Fall back to treating the argument as a filename
+         import json
+         data = json.load(open(data))
+
+     y = np.array(data["y"])
+     labels = data["labels"]
+     ts = [dateutil.parser.parse(t) for t in data["ts"]]
+
+     # Sort and keep only the top N series
+     if y.shape[0] > max_n:
+         # Sort series by their peak value
+         js = sorted(range(len(labels)), key=lambda j: max(y[j]), reverse=True)
+
+         # Roll everything beyond the top max_n series into an "other" bucket
+         other_indices = js[max_n:]
+         if other_indices:
+             other_sum = np.sum([y[j] for j in other_indices], axis=0)
+
+             # Top N indices, ordered by label
+             top_js = sorted(js[:max_n], key=lambda j: labels[j])
+
+             y = np.array([y[j] for j in top_js] + [other_sum])
+             labels = [labels[j] for j in top_js] + ["other"]
+         else:
+             # Unreachable: js[max_n:] is non-empty whenever y.shape[0] > max_n
+             pass
+     else:
+         # Fewer series than max_n: keep every series.
+         # The matplotlib plots leave the original label order untouched in
+         # this case, so no re-sorting is applied here either; y and labels
+         # are used exactly as loaded.
+         pass
+
+     y_sums = np.sum(y, axis=0)
+
+     # Avoid division by zero
+     y_sums[y_sums == 0] = 1.0
+
+     if normalize:
+         y = 100.0 * y / y_sums
+
+     return ts, y, labels
+
+ def plotly_stack_plot(data, max_n=20, normalize=False, title=None):
+     ts, y, labels = _process_stack_line_data(data, max_n, normalize)
+
+     fig = go.Figure()
+
+     # Use a qualitative color palette
+     colors = px.colors.qualitative.Plotly
+     if len(labels) > len(colors):
+         colors = px.colors.qualitative.Dark24  # larger palette if needed
+
+     for i, label in enumerate(labels):
+         color = colors[i % len(colors)]
+         fig.add_trace(go.Scatter(
+             x=ts,
+             y=y[i],
+             mode='lines',
+             name=label,
+             stackgroup='one',  # this enables stacking
+             line=dict(width=0.5, color=color),
+             fillcolor=color  # match the fill to the line color
+         ))
+
+     fig.update_layout(
+         title=dict(text=title, x=0.5) if title else None,
+         yaxis=dict(
+             title="Share of lines of code (%)" if normalize else "Lines of code",
+             range=[0, 100] if normalize else None
+         ),
+         xaxis=dict(title="Date"),
+         hovermode="x unified",
+         margin=dict(l=20, r=20, t=50, b=20),
+     )
+
+
+     return fig
+
+ def plotly_line_plot(data, max_n=20, normalize=False, title=None):
+     ts, y, labels = _process_stack_line_data(data, max_n, normalize)
+
+     fig = go.Figure()
+
+     for i, label in enumerate(labels):
+         fig.add_trace(go.Scatter(
+             x=ts,
+             y=y[i],
+             mode='lines',
+             name=label,
+             line=dict(width=2)
+         ))
+
+     fig.update_layout(
+         title=dict(text=title, x=0.5) if title else None,
+         yaxis=dict(
+             title="Share of lines of code (%)" if normalize else "Lines of code",
+             range=[0, 100] if normalize else None
+         ),
+         xaxis=dict(title="Date"),
+         hovermode="x unified",
+         margin=dict(l=20, r=20, t=50, b=20),
+     )
+
+
+     return fig
+
+ def plotly_survival_plot(commit_history, exp_fit=False, years=5, title=None):
+     # Logic adapted from survival_plot.py
+     # commit_history is {sha: [[ts, count], ...]}
+
+     deltas = collections.defaultdict(lambda: np.zeros(2))
+     total_n = 0
+     YEAR = 365.25 * 24 * 60 * 60
+
+     # Build per-commit deltas.
+     # The matplotlib survival_plot accepted a list of JSON filenames and
+     # merged their histories; here we operate on the single parsed dict that
+     # app.py passes in (the contents of one survival file).
+
+     # Each history is a list of [timestamp, surviving_line_count] pairs,
+     # starting with the commit itself; its final entry is used below to
+     # censor the commit's remaining lines at its last observed timestamp.
+
+     for commit, history in commit_history.items():
+         t0, orig_count = history[0]
+         total_n += orig_count
+         last_count = orig_count
+         for t, count in history[1:]:
+             deltas[t - t0] += (count - last_count, 0)
+             last_count = count
+         deltas[history[-1][0] - t0] += (-last_count, -orig_count)
+
+     # Calculate the survival curve
+     P = 1.0
+     xs = []
+     ys = []
+
+     # Sort deltas by elapsed time
+     sorted_times = sorted(deltas.keys())
+
+     # total_n is the current risk-set size; it shrinks as commits censor out
+
+     for t in sorted_times:
+         delta_k, delta_n = deltas[t]
+         xs.append(t / YEAR)
+         ys.append(100.0 * P)
+
+         if total_n > 0:
+             P *= 1 + delta_k / total_n
+
+         # Shrink the risk set by the lines censored at this time step
+         total_n += delta_n
+
+         if P < 0.05:
+             break
+
+     fig = go.Figure()
+
+     # Main survival curve
+     fig.add_trace(go.Scatter(
+         x=xs, y=ys,
+         mode='lines',
+         name='Survival Rate',
+         line=dict(color='blue')
+     ))
+
+     # Exponential fit
+     if exp_fit:
+         try:
+             import scipy.optimize
+
+             # Define the loss function for the fit
+             def fit(k):
+                 loss = 0.0
+                 # Re-walk the delta stream and compare the observed survival
+                 # fraction against the exponential model exp(-k * t / YEAR),
+                 # weighting each step by the current risk-set size.
+
+                 # Recompute the initial line count (total_n above was mutated):
+                 curr_total_n = 0
+                 for _, history in commit_history.items():
+                     curr_total_n += history[0][1]
+
+                 P_fit = 1.0
+                 curr_total_n_fit = curr_total_n
+
+                 for t in sorted_times:
+                     delta_k, delta_n = deltas[t]
+                     pred = curr_total_n_fit * math.exp(-k * t / YEAR)
+                     loss += (curr_total_n_fit * P_fit - pred) ** 2
+                     if curr_total_n_fit > 0:
+                         P_fit *= 1 + delta_k / curr_total_n_fit
+                     curr_total_n_fit += delta_n
+                 return loss
+
+             k_opt = scipy.optimize.fmin(fit, 0.5, maxiter=50, disp=False)[0]
+
+             ts_fit = np.linspace(0, years, 100)
+             ys_fit = [100.0 * math.exp(-k_opt * t) for t in ts_fit]
+
+             half_life = math.log(2) / k_opt
+
+             fig.add_trace(go.Scatter(
+                 x=ts_fit, y=ys_fit,
+                 mode='lines',
+                 name=f"Exp. Fit (Half-life: {half_life:.2f} yrs)",
+                 line=dict(color='red', dash='dash')
+             ))
+
+         except ImportError:
+             pass  # scipy is optional; skip the fit if it is unavailable
+
+     fig.update_layout(
+         title=dict(text=title, x=0.5) if title else None,
+         yaxis=dict(
+             title="lines still present (%)",
+             range=[0, 100]
+         ),
+         xaxis=dict(
+             title="Years",
+             range=[0, years]
+         ),
+         hovermode="x unified",
+         margin=dict(l=20, r=20, t=50, b=20),
+     )
+
+
+     return fig
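
plotly_survival_plot() builds a discrete survival curve: for each commit it accumulates signed changes in surviving line count keyed by elapsed time, then walks those deltas in order, multiplying the survival fraction P by (1 + delta_k / total_n) at each step while the risk set shrinks as commits drop out. A toy input (timestamps in seconds, all values invented) that exercises it without running an analysis:

    from git_of_theseus.plotly_plots import plotly_survival_plot

    # One commit observed at three weekly snapshots; "deadbeef" and the
    # counts are made up, only the {sha: [[ts, count], ...]} shape matters.
    week = 7 * 24 * 60 * 60
    history = {"deadbeef": [[0, 1000], [week, 900], [2 * week, 800]]}

    plotly_survival_plot(history, exp_fit=False, years=1, title="toy").show()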
git_of_theseus/stack_plot.py ADDED
@@ -0,0 +1,98 @@
+ # -*- coding: utf-8 -*-
+ #
+ # Copyright 2016 Erik Bernhardsson
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import matplotlib
+
+ matplotlib.use("Agg")
+
+ import argparse, dateutil.parser, json, numpy, sys
+ from matplotlib import pyplot
+
+ from .utils import generate_n_colors
+
+
+ def stack_plot(
+     input_fn_or_data, display=False, outfile=None, max_n=20, normalize=False, title=None
+ ):
+     if isinstance(input_fn_or_data, str):
+         data = json.load(open(input_fn_or_data))
+     else:
+         data = input_fn_or_data
+     y = numpy.array(data["y"])
+     if y.shape[0] > max_n:
+         js = sorted(range(len(data["labels"])), key=lambda j: max(y[j]), reverse=True)
+         other_sum = numpy.sum([y[j] for j in js[max_n:]], axis=0)
+         top_js = sorted(js[:max_n], key=lambda j: data["labels"][j])
+         y = numpy.array([y[j] for j in top_js] + [other_sum])
+         labels = [data["labels"][j] for j in top_js] + ["other"]
+     else:
+         labels = data["labels"]
+     if normalize:
+         y = 100.0 * numpy.array(y) / numpy.sum(y, axis=0)
+     fig = pyplot.figure(figsize=(16, 12), dpi=120)
+     pyplot.style.use("ggplot")
+     ts = [dateutil.parser.parse(t) for t in data["ts"]]
+     colors = generate_n_colors(len(labels))
+     pyplot.stackplot(ts, numpy.array(y), labels=labels, colors=colors)
+     pyplot.legend(loc=2)
+     if normalize:
+         pyplot.ylabel("Share of lines of code (%)")
+         pyplot.ylim([0, 100])
+     else:
+         pyplot.ylabel("Lines of code")
+
+     if title:
+         pyplot.text(0.5, 0.5, title, transform=pyplot.gca().transAxes,
+                     fontsize=40, color='gray', alpha=0.3,
+                     ha='center', va='center', rotation=30)
+
+     pyplot.tight_layout()
+     if outfile:
+         print("Writing output to %s" % outfile)
+         pyplot.savefig(outfile)
+
+     if display:
+         pyplot.show()
+
+     return fig
+
+
+ def stack_plot_cmdline():
+     parser = argparse.ArgumentParser(description="Plot stack plot")
+     parser.add_argument("--display", action="store_true", help="Display plot")
+     parser.add_argument(
+         "--outfile",
+         default="stack_plot.png",
+         type=str,
+         help="Output file to store results (default: %(default)s)",
+     )
+     parser.add_argument(
+         "--max-n",
+         default=20,
+         type=int,
+         help='Max number of dataseries (will roll everything else into "other") (default: %(default)s)',
+     )
+     parser.add_argument(
+         "--normalize", action="store_true", help="Normalize the plot to 100%%"
+     )
+     parser.add_argument("input_fn_or_data", metavar="input_fn")  # dest must match stack_plot()'s first parameter
+     kwargs = vars(parser.parse_args())
+
+     stack_plot(**kwargs)
+
+
+ if __name__ == "__main__":
+     stack_plot_cmdline()
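
stack_plot_cmdline() forwards the parsed argparse namespace straight into stack_plot() via **kwargs, so the option names map one-to-one onto the function's keyword parameters. A sketch that drives the CLI entry point programmatically (the cohorts.json input is an assumed output of the analyze step, and the git_of_theseus.stack_plot module path matches the filename assumed above):

    import sys
    from git_of_theseus.stack_plot import stack_plot_cmdline

    # Equivalent to: stack_plot --max-n 10 --normalize cohorts.json
    sys.argv = ["stack_plot", "--max-n", "10", "--normalize", "cohorts.json"]
    stack_plot_cmdline()  # writes stack_plot.png by default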