gitlytics 0.2.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gitlytics-0.2.0 → gitlytics-0.2.2}/PKG-INFO +1 -1
- {gitlytics-0.2.0 → gitlytics-0.2.2}/pyproject.toml +1 -1
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/__init__.py +65 -22
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/api.py +58 -13
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/automation.py +70 -28
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/cli.py +58 -26
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/core.py +116 -82
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/process.py +52 -41
- gitlytics-0.2.2/src/gitlytics/static/assets/index-CEcch8Fa.css +1 -0
- gitlytics-0.2.2/src/gitlytics/static/assets/index-DRUIdkFE.js +87 -0
- gitlytics-0.2.0/src/gitlytics/static/assets/index.es-DZq7ceO3.js → gitlytics-0.2.2/src/gitlytics/static/assets/index.es-BArdAPDI.js +1 -1
- gitlytics-0.2.0/src/gitlytics/static/assets/jspdf.es.min-CaU6ZJCD.js → gitlytics-0.2.2/src/gitlytics/static/assets/jspdf.es.min-RN_PhxQY.js +3 -3
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/index.html +2 -2
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics.egg-info/PKG-INFO +1 -1
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics.egg-info/SOURCES.txt +4 -4
- {gitlytics-0.2.0 → gitlytics-0.2.2}/tests/test_api.py +53 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/tests/test_automation.py +26 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/tests/test_cli.py +31 -3
- {gitlytics-0.2.0 → gitlytics-0.2.2}/tests/test_core.py +105 -0
- gitlytics-0.2.0/src/gitlytics/static/assets/index-Cx6oOScf.js +0 -87
- gitlytics-0.2.0/src/gitlytics/static/assets/index-DxtMptVs.css +0 -1
- {gitlytics-0.2.0 → gitlytics-0.2.2}/LICENSE +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/README.md +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/setup.cfg +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/__main__.py +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/android-chrome-192x192.png +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/android-chrome-512x512.png +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/apple-touch-icon.png +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/assets/html2canvas-pro.esm-9xys3ejh.js +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/assets/html2canvas.esm-DXEQVQnt.js +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/assets/purify.es-CC4Brkr_.js +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/favicon-16x16.png +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/favicon-32x32.png +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/favicon-48x48.png +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/favicon.ico +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/gitlytics-logo.png +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/google-search-icon-48x48.png +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/google-search-icon.png +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/robots.txt +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/sitemap.xml +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics.egg-info/dependency_links.txt +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics.egg-info/entry_points.txt +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics.egg-info/requires.txt +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics.egg-info/top_level.txt +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/tests/test_process.py +0 -0
- {gitlytics-0.2.0 → gitlytics-0.2.2}/tests/test_username.py +0 -0
|
@@ -5,10 +5,11 @@ The public API for the gitlytics package.
|
|
|
5
5
|
import os
|
|
6
6
|
import logging
|
|
7
7
|
import json
|
|
8
|
+
from pathlib import Path
|
|
8
9
|
|
|
9
10
|
# Single source of truth for the package version.
|
|
10
11
|
# Mirrors the version in pyproject.toml — keep them in sync.
|
|
11
|
-
__version__ = "0.2.0"
|
|
12
|
+
__version__ = "0.2.2"
|
|
12
13
|
|
|
13
14
|
__all__ = ["fetch_traffic", "sync", "serve_dashboard", "__version__"]
|
|
14
15
|
|
|
@@ -22,6 +23,44 @@ logger = logging.getLogger(__name__)
|
|
|
22
23
|
logger.addHandler(logging.NullHandler())
|
|
23
24
|
|
|
24
25
|
|
|
26
|
+
_VALID_METRICS = {
|
|
27
|
+
"views", "clones", "referrers", "paths", "stars", "forks",
|
|
28
|
+
"language", "topics", "watchers_count", "pushed_at",
|
|
29
|
+
"created_at", "open_issues_count",
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _coerce_metrics(metrics):
|
|
34
|
+
# Accept list, tuple, or set. Reject strings, ints, dicts, etc.
|
|
35
|
+
if metrics is None:
|
|
36
|
+
return None
|
|
37
|
+
if isinstance(metrics, str):
|
|
38
|
+
raise ValueError(
|
|
39
|
+
"metrics must be a list/tuple/set of strings, not a single string. "
|
|
40
|
+
"Pass ['views', 'clones'] instead of 'views clones'."
|
|
41
|
+
)
|
|
42
|
+
if isinstance(metrics, (list, tuple, set, frozenset)):
|
|
43
|
+
result = list(metrics)
|
|
44
|
+
for m in result:
|
|
45
|
+
if not isinstance(m, str):
|
|
46
|
+
raise ValueError(f"metrics entries must be strings; got {type(m).__name__}.")
|
|
47
|
+
# Drop unknown metric names with a warning — better than crashing mid-fetch.
|
|
48
|
+
unknown = [m for m in result if m not in _VALID_METRICS]
|
|
49
|
+
if unknown:
|
|
50
|
+
logger.warning(f"Unknown metrics ignored: {unknown}")
|
|
51
|
+
result = [m for m in result if m in _VALID_METRICS]
|
|
52
|
+
return result
|
|
53
|
+
raise ValueError(f"metrics must be a list/tuple/set; got {type(metrics).__name__}.")
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _write_json_safely(path: str, payload: dict) -> None:
|
|
57
|
+
# Create the parent dir so the user can pass any nested path.
|
|
58
|
+
p = Path(path)
|
|
59
|
+
p.parent.mkdir(parents=True, exist_ok=True)
|
|
60
|
+
with open(p, "w", encoding="utf-8") as f:
|
|
61
|
+
json.dump(payload, f, indent=2, ensure_ascii=False)
|
|
62
|
+
|
|
63
|
+
|
|
25
64
|
def fetch_traffic(token: str, repo_name=None, print_table: bool = False, return_format: str = "dataframe", save_file: str = None, metrics: list = None):
|
|
26
65
|
"""
|
|
27
66
|
Fetches the last 14 days of traffic data for one or all repositories.
|
|
@@ -43,6 +82,10 @@ def fetch_traffic(token: str, repo_name=None, print_table: bool = False, return_
|
|
|
43
82
|
A ``pandas.DataFrame`` when ``return_format="dataframe"``, otherwise
|
|
44
83
|
a ``dict`` matching the requested format.
|
|
45
84
|
"""
|
|
85
|
+
# Strip surrounding whitespace from the token (matches api._get_token).
|
|
86
|
+
token = (token or "").strip() if isinstance(token, str) else token
|
|
87
|
+
metrics = _coerce_metrics(metrics)
|
|
88
|
+
|
|
46
89
|
# Hit the GitHub API and get back a tidy DataFrame (one row per day per repo)
|
|
47
90
|
df = fetch_traffic_data(token, repo_name, metrics)
|
|
48
91
|
|
|
@@ -54,13 +97,14 @@ def fetch_traffic(token: str, repo_name=None, print_table: bool = False, return_
|
|
|
54
97
|
if return_format == "dataframe":
|
|
55
98
|
if save_file:
|
|
56
99
|
if save_file.endswith(".json"):
|
|
57
|
-
# Save as a chart-ready JSON file
|
|
100
|
+
# Save as a chart-ready JSON file (always public-only when exported to disk)
|
|
58
101
|
payload = build_json_payload(df, return_format="timeseries", export_public_only=True)
|
|
59
|
-
|
|
60
|
-
json.dump(payload, f, indent=2)
|
|
102
|
+
_write_json_safely(save_file, payload)
|
|
61
103
|
else:
|
|
62
104
|
# Save as a standard CSV file
|
|
63
|
-
|
|
105
|
+
p = Path(save_file)
|
|
106
|
+
p.parent.mkdir(parents=True, exist_ok=True)
|
|
107
|
+
df.to_csv(p, index=False)
|
|
64
108
|
return df
|
|
65
109
|
|
|
66
110
|
# Reject anything that isn't a known format before doing more work
|
|
@@ -71,13 +115,14 @@ def fetch_traffic(token: str, repo_name=None, print_table: bool = False, return_
|
|
|
71
115
|
f"Choose one of: 'dataframe', 'timeseries', 'summary'."
|
|
72
116
|
)
|
|
73
117
|
|
|
74
|
-
# Build the JSON-serialisable payload in the requested shape
|
|
75
|
-
|
|
118
|
+
# Build the JSON-serialisable payload in the requested shape.
|
|
119
|
+
# When persisting to disk, strip private repos by default (security firewall).
|
|
120
|
+
export_public_only = bool(save_file)
|
|
121
|
+
payload = build_json_payload(df, return_format=return_format, export_public_only=export_public_only)
|
|
76
122
|
|
|
77
123
|
# Save to disk if the user gave us a file path
|
|
78
124
|
if save_file:
|
|
79
|
-
|
|
80
|
-
json.dump(payload, f, indent=2)
|
|
125
|
+
_write_json_safely(save_file, payload)
|
|
81
126
|
|
|
82
127
|
return payload
|
|
83
128
|
|
|
@@ -98,10 +143,12 @@ def sync(token: str, repo_name=None, data_dir: str = "./data", output_mode: str
|
|
|
98
143
|
exported JSON — acts as a security firewall.
|
|
99
144
|
metrics: Optional list of metrics to fetch (e.g., ``["views", "clones"]``).
|
|
100
145
|
"""
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
146
|
+
# Strip the token. Resolve the data_dir to an absolute path but NEVER redirect
|
|
147
|
+
# to a sibling directory — respect the user's CWD.
|
|
148
|
+
token = (token or "").strip() if isinstance(token, str) else token
|
|
149
|
+
metrics = _coerce_metrics(metrics)
|
|
150
|
+
if data_dir:
|
|
151
|
+
data_dir = str(Path(data_dir).expanduser().resolve())
|
|
105
152
|
|
|
106
153
|
# Hand off to the automation engine — it handles deduplication and schema migration
|
|
107
154
|
run_sync(
|
|
@@ -145,17 +192,13 @@ def serve_dashboard(host: str = "127.0.0.1", port: int = 8000, token: str = None
|
|
|
145
192
|
_orig_data_dir = os.environ.get("GITLYTICS_DATA_DIR")
|
|
146
193
|
try:
|
|
147
194
|
if token:
|
|
148
|
-
os.environ["GITLYTICS_TOKEN"] = token
|
|
195
|
+
os.environ["GITLYTICS_TOKEN"] = (token or "").strip()
|
|
149
196
|
if data_dir:
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
if not
|
|
153
|
-
parent_dir = os.path.abspath(os.path.join("..", data_dir))
|
|
154
|
-
if os.path.exists(parent_dir):
|
|
155
|
-
abs_data_dir = parent_dir
|
|
156
|
-
if not os.path.exists(abs_data_dir):
|
|
197
|
+
abs_data_dir = str(Path(data_dir).expanduser().resolve())
|
|
198
|
+
p = Path(abs_data_dir)
|
|
199
|
+
if not p.exists():
|
|
157
200
|
print(f"⚠️ Warning: The specified data directory '{data_dir}' (resolved to '{abs_data_dir}') does not exist.")
|
|
158
|
-
elif not any(
|
|
201
|
+
elif not any(p.glob("traffic_*.csv")):
|
|
159
202
|
print(f"⚠️ Warning: No traffic_*.csv database files found in '{data_dir}' (resolved to '{abs_data_dir}').")
|
|
160
203
|
os.environ["GITLYTICS_DATA_DIR"] = abs_data_dir
|
|
161
204
|
uvicorn.run("gitlytics.api:app", host=host, port=port, reload=False)
|
|
@@ -5,7 +5,9 @@ Powers the FastAPI backend — serves traffic data and the React dashboard to th
|
|
|
5
5
|
import hashlib
|
|
6
6
|
import logging
|
|
7
7
|
import os
|
|
8
|
+
import shutil
|
|
8
9
|
import time as _time
|
|
10
|
+
import uuid
|
|
9
11
|
from pathlib import Path
|
|
10
12
|
|
|
11
13
|
import pandas as pd
|
|
@@ -28,15 +30,14 @@ logger = logging.getLogger(__name__)
|
|
|
28
30
|
|
|
29
31
|
app = FastAPI(title="GitHub Traffic API")
|
|
30
32
|
|
|
31
|
-
# Only allow requests from localhost — never deployed publicly
|
|
33
|
+
# Only allow requests from localhost — never deployed publicly.
|
|
34
|
+
# Vite's dev port (5173) is intentionally excluded from the production allowlist.
|
|
32
35
|
_ALLOWED_ORIGINS = [
|
|
33
36
|
"http://localhost",
|
|
34
37
|
"http://localhost:3000",
|
|
35
|
-
"http://localhost:5173",
|
|
36
38
|
"http://localhost:8000",
|
|
37
39
|
"http://127.0.0.1",
|
|
38
40
|
"http://127.0.0.1:3000",
|
|
39
|
-
"http://127.0.0.1:5173",
|
|
40
41
|
"http://127.0.0.1:8000",
|
|
41
42
|
]
|
|
42
43
|
|
|
@@ -45,13 +46,18 @@ app.add_middleware(
|
|
|
45
46
|
allow_origins=_ALLOWED_ORIGINS,
|
|
46
47
|
allow_credentials=True,
|
|
47
48
|
allow_methods=["GET", "POST"],
|
|
48
|
-
|
|
49
|
+
# `Authorization` is required for cross-origin deployments that pass the
|
|
50
|
+
# GitHub PAT in a header. Same-origin requests are unaffected.
|
|
51
|
+
allow_headers=["Content-Type", "Authorization"],
|
|
49
52
|
)
|
|
50
53
|
|
|
51
54
|
|
|
52
55
|
_auth_cache: dict = {} # sha256_prefix -> (valid, username, expires_at)
|
|
53
56
|
_AUTH_CACHE_TTL = 300 # 5 minutes
|
|
54
57
|
|
|
58
|
+
# Hard cap on CSV upload size — prevents DoS via /api/upload-csv.
|
|
59
|
+
_MAX_UPLOAD_BYTES = 25 * 1024 * 1024
|
|
60
|
+
|
|
55
61
|
|
|
56
62
|
def _get_token(token: str = None) -> str:
|
|
57
63
|
# C-2: explicit empty string must not fall through to the env token
|
|
@@ -62,14 +68,14 @@ def _get_token(token: str = None) -> str:
|
|
|
62
68
|
|
|
63
69
|
def _validate_token_cached(token: str):
|
|
64
70
|
# M-1: cache validation results to avoid a double HTTP round-trip on every /api/traffic call
|
|
71
|
+
from gitlytics.core import validate_token as _validate_token
|
|
65
72
|
key = hashlib.sha256(token.encode()).hexdigest()[:16]
|
|
66
73
|
now = _time.time()
|
|
67
74
|
if key in _auth_cache:
|
|
68
75
|
valid, username, expires = _auth_cache[key]
|
|
69
76
|
if now < expires:
|
|
70
77
|
return valid, username
|
|
71
|
-
|
|
72
|
-
valid, username = validate_token(token)
|
|
78
|
+
valid, username = _validate_token(token)
|
|
73
79
|
_auth_cache[key] = (valid, username, now + _AUTH_CACHE_TTL)
|
|
74
80
|
return valid, username
|
|
75
81
|
|
|
@@ -115,10 +121,22 @@ def get_username_data(username: str = Body("", embed=True)):
|
|
|
115
121
|
raise HTTPException(status_code=400, detail="Username is required.")
|
|
116
122
|
try:
|
|
117
123
|
profile = get_public_user(username.strip())
|
|
124
|
+
# Distinguish "user found" (login matches the request) from "GitHub failed
|
|
125
|
+
# and we returned a stub". Returning 200 for the latter would silently mask
|
|
126
|
+
# upstream outages.
|
|
127
|
+
if profile.get("login") == username.strip() and profile.get("html_url"):
|
|
128
|
+
pass
|
|
129
|
+
elif profile.get("login") == username.strip():
|
|
130
|
+
# Found but GitHub didn't include html_url — still treat as success.
|
|
131
|
+
pass
|
|
132
|
+
else:
|
|
133
|
+
raise HTTPException(status_code=502, detail="GitHub did not return a profile.")
|
|
118
134
|
repos = get_public_repos(username.strip())
|
|
119
135
|
return {"profile": profile, "repos": repos}
|
|
120
136
|
except ValueError as exc:
|
|
121
137
|
raise HTTPException(status_code=404, detail=str(exc))
|
|
138
|
+
except HTTPException:
|
|
139
|
+
raise
|
|
122
140
|
except Exception as exc:
|
|
123
141
|
logger.warning(f"Username fetch failed for {username}: {exc}")
|
|
124
142
|
raise HTTPException(status_code=500, detail="Failed to fetch GitHub data.")
|
|
@@ -170,9 +188,14 @@ def get_traffic(token: str = Body("", embed=True)):
|
|
|
170
188
|
|
|
171
189
|
df = df.replace([float('inf'), float('-inf')], None).where(pd.notnull(df), None)
|
|
172
190
|
|
|
173
|
-
# Build a quick view-sum map to find the top 20 repos for deep fetching
|
|
191
|
+
# Build a quick view-sum map to find the top 20 repos for deep fetching.
|
|
192
|
+
# Guard against the subset-metrics case where `views` may be absent.
|
|
174
193
|
repos_with_views = []
|
|
175
|
-
if
|
|
194
|
+
if (
|
|
195
|
+
not df.empty
|
|
196
|
+
and "repository" in df.columns
|
|
197
|
+
and "views" in df.columns
|
|
198
|
+
):
|
|
176
199
|
for repo_name, group in df.groupby("repository"):
|
|
177
200
|
repos_with_views.append({"repository": repo_name, "total_views": int(group["views"].sum())})
|
|
178
201
|
|
|
@@ -192,16 +215,34 @@ def upload_csv(file: UploadFile = File(...)):
|
|
|
192
215
|
if data_dir:
|
|
193
216
|
data_dir_path = Path(data_dir)
|
|
194
217
|
data_dir_path.mkdir(parents=True, exist_ok=True)
|
|
218
|
+
# Stream the upload to disk so we don't buffer the whole file in
|
|
219
|
+
# memory. Enforce the size cap on the way in.
|
|
220
|
+
dest = data_dir_path / f"traffic_uploaded_{uuid.uuid4().hex}.csv"
|
|
221
|
+
total = 0
|
|
222
|
+
with open(dest, "wb") as out:
|
|
223
|
+
while True:
|
|
224
|
+
chunk = file.file.read(1024 * 1024)
|
|
225
|
+
if not chunk:
|
|
226
|
+
break
|
|
227
|
+
total += len(chunk)
|
|
228
|
+
if total > _MAX_UPLOAD_BYTES:
|
|
229
|
+
out.close()
|
|
230
|
+
try:
|
|
231
|
+
dest.unlink()
|
|
232
|
+
except OSError:
|
|
233
|
+
pass
|
|
234
|
+
raise HTTPException(
|
|
235
|
+
status_code=413,
|
|
236
|
+
detail=f"CSV too large. Maximum size is {_MAX_UPLOAD_BYTES // (1024 * 1024)} MB.",
|
|
237
|
+
)
|
|
238
|
+
out.write(chunk)
|
|
195
239
|
file.file.seek(0)
|
|
196
|
-
content = file.file.read()
|
|
197
|
-
file.file.seek(0)
|
|
198
|
-
dest = data_dir_path / f"traffic_uploaded_{int(_time.time())}.csv"
|
|
199
|
-
with open(dest, "wb") as f:
|
|
200
|
-
f.write(content)
|
|
201
240
|
df = process_uploaded_csv(file.file)
|
|
202
241
|
df = df.replace([float('inf'), float('-inf')], None).where(pd.notnull(df), None)
|
|
203
242
|
payload = build_react_payload(df, deep_stats=None)
|
|
204
243
|
return payload
|
|
244
|
+
except HTTPException:
|
|
245
|
+
raise
|
|
205
246
|
except Exception as e:
|
|
206
247
|
raise HTTPException(status_code=400, detail=str(e))
|
|
207
248
|
|
|
@@ -224,6 +265,10 @@ def serve_index():
|
|
|
224
265
|
@app.get("/{full_path:path}")
|
|
225
266
|
def serve_spa_fallback(full_path: str):
|
|
226
267
|
"""SPA catch-all — returns index.html so React Router handles navigation."""
|
|
268
|
+
# Any unhandled /api/* path is a real 404, not a SPA route.
|
|
269
|
+
if full_path.startswith("api/"):
|
|
270
|
+
return JSONResponse(status_code=404, content={"error": "Not found."})
|
|
271
|
+
|
|
227
272
|
asset_file = frontend_dir / full_path
|
|
228
273
|
if asset_file.exists() and asset_file.is_file():
|
|
229
274
|
return FileResponse(asset_file)
|
|
@@ -74,7 +74,14 @@ def export_json_database(data_dir: str, export_path: str, export_public_only: bo
|
|
|
74
74
|
export_file.parent.mkdir(parents=True, exist_ok=True)
|
|
75
75
|
|
|
76
76
|
with open(export_file, "w", encoding="utf-8") as f:
|
|
77
|
-
json.dump(payload, f, indent=2)
|
|
77
|
+
json.dump(payload, f, indent=2, ensure_ascii=False)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _normalize_field(name: str) -> str:
|
|
81
|
+
# Strip BOM, lower-case, and trim so older CSVs match the new schema.
|
|
82
|
+
if not isinstance(name, str):
|
|
83
|
+
return ""
|
|
84
|
+
return name.lstrip("").strip().lower()
|
|
78
85
|
|
|
79
86
|
|
|
80
87
|
def _merge_schema(existing_fields: list, new_fields: list) -> list:
|
|
@@ -83,18 +90,30 @@ def _merge_schema(existing_fields: list, new_fields: list) -> list:
|
|
|
83
90
|
Existing fields keep their original order; new fields are appended at the end
|
|
84
91
|
so historical rows stay compatible with the updated schema.
|
|
85
92
|
"""
|
|
86
|
-
|
|
93
|
+
# Normalize on the fly so 'Date' and 'date' are treated identically.
|
|
94
|
+
norm_existing = [_normalize_field(f) for f in existing_fields]
|
|
95
|
+
merged_norm = list(norm_existing)
|
|
96
|
+
original_for_norm = list(existing_fields)
|
|
87
97
|
for col in new_fields:
|
|
88
|
-
|
|
98
|
+
n = _normalize_field(col)
|
|
99
|
+
if n and n not in merged_norm:
|
|
89
100
|
# A new column appeared in the API response — add it so it gets saved
|
|
90
101
|
logger.info(f"Schema upgrade: adding new column '{col}' to existing CSV.")
|
|
91
|
-
|
|
92
|
-
|
|
102
|
+
merged_norm.append(n)
|
|
103
|
+
original_for_norm.append(col)
|
|
104
|
+
# Return the original-cased names where possible so the CSV header stays human-readable.
|
|
105
|
+
return original_for_norm
|
|
93
106
|
|
|
94
107
|
|
|
95
108
|
def run_sync_cycle(token: str, repo_names=None, data_dir="./data", output_mode="monthly", export_json=None, export_public_only=True, metrics: list = None):
|
|
96
109
|
# Fetch fresh traffic data from GitHub
|
|
97
110
|
df = fetch_traffic_data(token, repo_names, metrics)
|
|
111
|
+
|
|
112
|
+
# Always regenerate the export if requested, even when fresh data is empty,
|
|
113
|
+
# so the export file never becomes silently stale.
|
|
114
|
+
if export_json:
|
|
115
|
+
export_json_database(data_dir, export_json, export_public_only)
|
|
116
|
+
|
|
98
117
|
if df.empty:
|
|
99
118
|
logger.info("No traffic data found to sync.")
|
|
100
119
|
return
|
|
@@ -108,18 +127,19 @@ def run_sync_cycle(token: str, repo_names=None, data_dir="./data", output_mode="
|
|
|
108
127
|
|
|
109
128
|
if file_exists:
|
|
110
129
|
# Read the existing column headers so we can migrate the schema if needed
|
|
111
|
-
with open(csv_path, "r", encoding="utf-8") as f:
|
|
130
|
+
with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
|
|
112
131
|
reader = csv.reader(f)
|
|
113
132
|
try:
|
|
114
133
|
existing_fields = next(reader)
|
|
115
134
|
except StopIteration:
|
|
116
135
|
pass
|
|
117
136
|
|
|
118
|
-
# Load all existing rows
|
|
137
|
+
# Load all existing rows in one pass — avoid iterrows() on large files.
|
|
119
138
|
try:
|
|
120
139
|
existing_df = pd.read_csv(csv_path)
|
|
121
|
-
for
|
|
122
|
-
|
|
140
|
+
for record in existing_df.to_dict("records"):
|
|
141
|
+
key = (str(record.get("repository", "")), str(record.get("date", "")))
|
|
142
|
+
existing_data[key] = record
|
|
123
143
|
except Exception as exc:
|
|
124
144
|
logger.warning(f"Could not read existing CSV '{csv_path}': {exc}. Starting fresh.")
|
|
125
145
|
|
|
@@ -132,24 +152,32 @@ def run_sync_cycle(token: str, repo_names=None, data_dir="./data", output_mode="
|
|
|
132
152
|
|
|
133
153
|
# Merge fresh data into existing rows — preserves columns not present in this sync run
|
|
134
154
|
new_records_added = 0
|
|
135
|
-
for
|
|
136
|
-
key = (str(
|
|
155
|
+
for record in df.to_dict("records"):
|
|
156
|
+
key = (str(record.get("repository", "")), str(record.get("date", "")))
|
|
137
157
|
if key not in existing_data:
|
|
138
158
|
new_records_added += 1
|
|
139
|
-
existing_data[key] =
|
|
159
|
+
existing_data[key] = record
|
|
140
160
|
else:
|
|
141
|
-
existing_data[key].update(
|
|
161
|
+
existing_data[key].update(record)
|
|
142
162
|
|
|
143
163
|
# Sort all rows by date and repo name before writing back to disk
|
|
144
164
|
final_rows = []
|
|
145
165
|
for v in existing_data.values():
|
|
146
166
|
# Fill any missing schema columns with empty strings for old rows
|
|
147
|
-
clean_row = {
|
|
167
|
+
clean_row = {}
|
|
168
|
+
for k in existing_fields:
|
|
169
|
+
nk = _normalize_field(k)
|
|
170
|
+
if nk in v:
|
|
171
|
+
clean_row[k] = v[nk]
|
|
172
|
+
else:
|
|
173
|
+
# Try to match by normalized name to old-style keys in the dict.
|
|
174
|
+
match = next((orig for orig in v.keys() if _normalize_field(orig) == nk), None)
|
|
175
|
+
clean_row[k] = v[match] if match is not None else ""
|
|
148
176
|
final_rows.append(clean_row)
|
|
149
177
|
|
|
150
|
-
final_rows.sort(key=lambda x: (x.get("date", ""), x.get("repository", "")))
|
|
178
|
+
final_rows.sort(key=lambda x: (str(x.get("date", "")), str(x.get("repository", ""))))
|
|
151
179
|
|
|
152
|
-
# Write everything back to the CSV file
|
|
180
|
+
# Write everything back to the CSV file
|
|
153
181
|
with open(csv_path, "w", newline="", encoding="utf-8") as f:
|
|
154
182
|
writer = csv.DictWriter(f, fieldnames=existing_fields, extrasaction='ignore')
|
|
155
183
|
writer.writeheader()
|
|
@@ -157,10 +185,15 @@ def run_sync_cycle(token: str, repo_names=None, data_dir="./data", output_mode="
|
|
|
157
185
|
|
|
158
186
|
logger.info(f"Successfully processed traffic data. Added {new_records_added} new daily records to {csv_path}")
|
|
159
187
|
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
188
|
+
|
|
189
|
+
def _interruptible_sleep(total_seconds: float) -> None:
|
|
190
|
+
# Sleep in short chunks so SIGINT/SIGTERM (Ctrl-C, docker stop) is responsive.
|
|
191
|
+
chunk = 1.0
|
|
192
|
+
elapsed = 0.0
|
|
193
|
+
while elapsed < total_seconds:
|
|
194
|
+
remaining = total_seconds - elapsed
|
|
195
|
+
time.sleep(min(chunk, remaining))
|
|
196
|
+
elapsed += chunk
|
|
164
197
|
|
|
165
198
|
|
|
166
199
|
def run_sync(token: str, repo_names=None, data_dir="./data", output_mode="monthly", schedule_cron=None, export_json=None, export_public_only=True, metrics: list = None):
|
|
@@ -173,7 +206,7 @@ def run_sync(token: str, repo_names=None, data_dir="./data", output_mode="monthl
|
|
|
173
206
|
# Parse the cron expression — fail early if the format is wrong
|
|
174
207
|
try:
|
|
175
208
|
croniter(schedule_cron, datetime.now(timezone.utc)) # validate only
|
|
176
|
-
except ValueError as e:
|
|
209
|
+
except (ValueError, KeyError, TypeError) as e:
|
|
177
210
|
logger.error(f"Invalid cron expression: {e}")
|
|
178
211
|
return
|
|
179
212
|
|
|
@@ -190,21 +223,25 @@ def run_sync(token: str, repo_names=None, data_dir="./data", output_mode="monthl
|
|
|
190
223
|
|
|
191
224
|
sleep_secs = (next_run - now_utc).total_seconds()
|
|
192
225
|
|
|
193
|
-
#
|
|
194
|
-
if sleep_secs
|
|
195
|
-
|
|
196
|
-
|
|
226
|
+
# Guard against negative/zero sleep (clock skew): minimum 1s to avoid hot-looping.
|
|
227
|
+
if sleep_secs <= 0:
|
|
228
|
+
sleep_secs = 1.0
|
|
229
|
+
|
|
230
|
+
logger.info(f"Scheduled next sync for {next_run.strftime('%Y-%m-%d %H:%M:%S UTC')}. Sleeping {sleep_secs:.0f}s...")
|
|
231
|
+
_interruptible_sleep(sleep_secs)
|
|
197
232
|
|
|
198
233
|
try:
|
|
199
234
|
# Always re-validate the token before fetching to catch expiry early
|
|
200
235
|
is_valid, msg = validate_token(token)
|
|
201
236
|
if not is_valid:
|
|
202
237
|
# Distinguish between a dead token (stop forever) and a network blip (retry next cycle)
|
|
238
|
+
msg_lower = msg.lower()
|
|
203
239
|
is_auth_failure = (
|
|
204
240
|
"401" in msg
|
|
205
|
-
or "authentication failed" in
|
|
206
|
-
or "invalid token" in
|
|
207
|
-
or "revoked" in
|
|
241
|
+
or "authentication failed" in msg_lower
|
|
242
|
+
or "invalid token" in msg_lower
|
|
243
|
+
or "revoked" in msg_lower
|
|
244
|
+
or "bad credentials" in msg_lower
|
|
208
245
|
)
|
|
209
246
|
if is_auth_failure:
|
|
210
247
|
logger.critical(
|
|
@@ -217,6 +254,11 @@ def run_sync(token: str, repo_names=None, data_dir="./data", output_mode="monthl
|
|
|
217
254
|
continue
|
|
218
255
|
|
|
219
256
|
run_sync_cycle(token, repo_names, data_dir, output_mode, export_json, export_public_only, metrics)
|
|
257
|
+
except SystemExit:
|
|
258
|
+
raise
|
|
259
|
+
except KeyboardInterrupt:
|
|
260
|
+
logger.info("Daemon interrupted by user. Exiting.")
|
|
261
|
+
raise
|
|
220
262
|
except Exception as e:
|
|
221
263
|
# Don't let a single bad cycle kill the daemon — just log and carry on
|
|
222
264
|
logger.error(f"Daemon encountered unexpected error: {e}. Recovering for next cycle.")
|
|
@@ -6,7 +6,6 @@ Achieves 100% feature parity with the Python API.
|
|
|
6
6
|
import argparse
|
|
7
7
|
import sys
|
|
8
8
|
import os
|
|
9
|
-
import json
|
|
10
9
|
|
|
11
10
|
# Import the three public functions that power every subcommand
|
|
12
11
|
from gitlytics import fetch_traffic, sync, serve_dashboard
|
|
@@ -16,7 +15,14 @@ def parse_repo_names(repo_arg: str):
|
|
|
16
15
|
# Turn "owner/repo1, owner/repo2" into ["owner/repo1", "owner/repo2"], or None if empty
|
|
17
16
|
if not repo_arg:
|
|
18
17
|
return None
|
|
19
|
-
|
|
18
|
+
names = [r.strip() for r in repo_arg.split(",")]
|
|
19
|
+
# Validate the format up front so the user gets a clear error instead of a 404.
|
|
20
|
+
for name in names:
|
|
21
|
+
if "/" not in name:
|
|
22
|
+
raise argparse.ArgumentTypeError(
|
|
23
|
+
f"Invalid repo name {name!r}: expected 'owner/repo' format."
|
|
24
|
+
)
|
|
25
|
+
return names
|
|
20
26
|
|
|
21
27
|
|
|
22
28
|
def main():
|
|
@@ -72,13 +78,14 @@ def main():
|
|
|
72
78
|
|
|
73
79
|
args = parser.parse_args()
|
|
74
80
|
|
|
75
|
-
# Print help and exit if the user didn't give a subcommand
|
|
81
|
+
# Print help and exit cleanly (code 0) if the user didn't give a subcommand.
|
|
76
82
|
if not args.command:
|
|
77
83
|
parser.print_help()
|
|
78
|
-
sys.exit(
|
|
84
|
+
sys.exit(0)
|
|
79
85
|
|
|
80
86
|
# Resolve token: CLI flag wins, then environment variables
|
|
81
|
-
|
|
87
|
+
raw_token = getattr(args, "token", None)
|
|
88
|
+
token = (raw_token or os.environ.get("GITLYTICS_TOKEN") or os.environ.get("GITHUB_TOKEN") or "").strip() or None
|
|
82
89
|
|
|
83
90
|
# fetch and sync both need a token — bail early with a clear message
|
|
84
91
|
if args.command in ["fetch", "sync"] and not token:
|
|
@@ -86,31 +93,56 @@ def main():
|
|
|
86
93
|
sys.exit(1)
|
|
87
94
|
|
|
88
95
|
if args.command == "fetch":
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
96
|
+
try:
|
|
97
|
+
repos = parse_repo_names(args.repo_name)
|
|
98
|
+
except argparse.ArgumentTypeError as exc:
|
|
99
|
+
print(f"❌ Error: {exc}")
|
|
100
|
+
sys.exit(2)
|
|
101
|
+
try:
|
|
102
|
+
result = fetch_traffic(
|
|
103
|
+
token=token,
|
|
104
|
+
repo_name=repos,
|
|
105
|
+
print_table=args.print_table,
|
|
106
|
+
return_format=args.return_format,
|
|
107
|
+
save_file=args.save_file,
|
|
108
|
+
metrics=args.metrics
|
|
109
|
+
)
|
|
110
|
+
except ValueError as exc:
|
|
111
|
+
print(f"❌ Error: {exc}")
|
|
112
|
+
sys.exit(2)
|
|
98
113
|
# Give the user a hint if they didn't ask for any output
|
|
99
114
|
if not args.print_table and not args.save_file:
|
|
100
|
-
|
|
115
|
+
try:
|
|
116
|
+
import pandas as pd
|
|
117
|
+
if isinstance(result, pd.DataFrame):
|
|
118
|
+
print(f"Fetch successful. {len(result)} row(s) across {result['repository'].nunique() if not result.empty else 0} repo(s).")
|
|
119
|
+
print("First 5 rows:")
|
|
120
|
+
print(result.head().to_string(index=False))
|
|
121
|
+
else:
|
|
122
|
+
print(f"Fetch successful. Use --print-table or --save-file to see results.")
|
|
123
|
+
except Exception:
|
|
124
|
+
print("Fetch successful. Use --print-table or --save-file to see results.")
|
|
101
125
|
|
|
102
126
|
elif args.command == "sync":
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
127
|
+
try:
|
|
128
|
+
repos = parse_repo_names(args.repo_name)
|
|
129
|
+
except argparse.ArgumentTypeError as exc:
|
|
130
|
+
print(f"❌ Error: {exc}")
|
|
131
|
+
sys.exit(2)
|
|
132
|
+
try:
|
|
133
|
+
sync(
|
|
134
|
+
token=token,
|
|
135
|
+
repo_name=repos,
|
|
136
|
+
data_dir=args.data_dir,
|
|
137
|
+
output_mode=args.output_mode,
|
|
138
|
+
schedule_cron=args.schedule_cron,
|
|
139
|
+
export_json=args.export_json,
|
|
140
|
+
export_public_only=args.export_public_only,
|
|
141
|
+
metrics=args.metrics
|
|
142
|
+
)
|
|
143
|
+
except ValueError as exc:
|
|
144
|
+
print(f"❌ Error: {exc}")
|
|
145
|
+
sys.exit(2)
|
|
114
146
|
|
|
115
147
|
elif args.command == "dashboard":
|
|
116
148
|
print(f"\n[Gitlytics] Starting Gitlytics Dashboard on http://{args.host}:{args.port}\n")
|