gitlytics 0.2.0__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {gitlytics-0.2.0 → gitlytics-0.2.2}/PKG-INFO +1 -1
  2. {gitlytics-0.2.0 → gitlytics-0.2.2}/pyproject.toml +1 -1
  3. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/__init__.py +65 -22
  4. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/api.py +58 -13
  5. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/automation.py +70 -28
  6. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/cli.py +58 -26
  7. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/core.py +116 -82
  8. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/process.py +52 -41
  9. gitlytics-0.2.2/src/gitlytics/static/assets/index-CEcch8Fa.css +1 -0
  10. gitlytics-0.2.2/src/gitlytics/static/assets/index-DRUIdkFE.js +87 -0
  11. gitlytics-0.2.0/src/gitlytics/static/assets/index.es-DZq7ceO3.js → gitlytics-0.2.2/src/gitlytics/static/assets/index.es-BArdAPDI.js +1 -1
  12. gitlytics-0.2.0/src/gitlytics/static/assets/jspdf.es.min-CaU6ZJCD.js → gitlytics-0.2.2/src/gitlytics/static/assets/jspdf.es.min-RN_PhxQY.js +3 -3
  13. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/index.html +2 -2
  14. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics.egg-info/PKG-INFO +1 -1
  15. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics.egg-info/SOURCES.txt +4 -4
  16. {gitlytics-0.2.0 → gitlytics-0.2.2}/tests/test_api.py +53 -0
  17. {gitlytics-0.2.0 → gitlytics-0.2.2}/tests/test_automation.py +26 -0
  18. {gitlytics-0.2.0 → gitlytics-0.2.2}/tests/test_cli.py +31 -3
  19. {gitlytics-0.2.0 → gitlytics-0.2.2}/tests/test_core.py +105 -0
  20. gitlytics-0.2.0/src/gitlytics/static/assets/index-Cx6oOScf.js +0 -87
  21. gitlytics-0.2.0/src/gitlytics/static/assets/index-DxtMptVs.css +0 -1
  22. {gitlytics-0.2.0 → gitlytics-0.2.2}/LICENSE +0 -0
  23. {gitlytics-0.2.0 → gitlytics-0.2.2}/README.md +0 -0
  24. {gitlytics-0.2.0 → gitlytics-0.2.2}/setup.cfg +0 -0
  25. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/__main__.py +0 -0
  26. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/android-chrome-192x192.png +0 -0
  27. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/android-chrome-512x512.png +0 -0
  28. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/apple-touch-icon.png +0 -0
  29. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/assets/html2canvas-pro.esm-9xys3ejh.js +0 -0
  30. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/assets/html2canvas.esm-DXEQVQnt.js +0 -0
  31. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/assets/purify.es-CC4Brkr_.js +0 -0
  32. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/favicon-16x16.png +0 -0
  33. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/favicon-32x32.png +0 -0
  34. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/favicon-48x48.png +0 -0
  35. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/favicon.ico +0 -0
  36. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/gitlytics-logo.png +0 -0
  37. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/google-search-icon-48x48.png +0 -0
  38. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/google-search-icon.png +0 -0
  39. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/robots.txt +0 -0
  40. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics/static/sitemap.xml +0 -0
  41. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics.egg-info/dependency_links.txt +0 -0
  42. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics.egg-info/entry_points.txt +0 -0
  43. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics.egg-info/requires.txt +0 -0
  44. {gitlytics-0.2.0 → gitlytics-0.2.2}/src/gitlytics.egg-info/top_level.txt +0 -0
  45. {gitlytics-0.2.0 → gitlytics-0.2.2}/tests/test_process.py +0 -0
  46. {gitlytics-0.2.0 → gitlytics-0.2.2}/tests/test_username.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gitlytics
3
- Version: 0.2.0
3
+ Version: 0.2.2
4
4
  Summary: Monitor and automate your GitHub repository traffic analytics.
5
5
  Author-email: Ameya Chopade <ameyaccod171@gmail.com>
6
6
  License: Apache-2.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gitlytics"
7
- version = "0.2.0"
7
+ version = "0.2.2"
8
8
  description = "Monitor and automate your GitHub repository traffic analytics."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -5,10 +5,11 @@ The public API for the gitlytics package.
5
5
  import os
6
6
  import logging
7
7
  import json
8
+ from pathlib import Path
8
9
 
9
10
  # Single source of truth for the package version.
10
11
  # Mirrors the version in pyproject.toml — keep them in sync.
11
- __version__ = "0.2.0"
12
+ __version__ = "0.2.2"
12
13
 
13
14
  __all__ = ["fetch_traffic", "sync", "serve_dashboard", "__version__"]
14
15
 
@@ -22,6 +23,44 @@ logger = logging.getLogger(__name__)
22
23
  logger.addHandler(logging.NullHandler())
23
24
 
24
25
 
26
+ _VALID_METRICS = {
27
+ "views", "clones", "referrers", "paths", "stars", "forks",
28
+ "language", "topics", "watchers_count", "pushed_at",
29
+ "created_at", "open_issues_count",
30
+ }
31
+
32
+
33
+ def _coerce_metrics(metrics):
34
+ # Accept list, tuple, or set. Reject strings, ints, dicts, etc.
35
+ if metrics is None:
36
+ return None
37
+ if isinstance(metrics, str):
38
+ raise ValueError(
39
+ "metrics must be a list/tuple/set of strings, not a single string. "
40
+ "Pass ['views', 'clones'] instead of 'views clones'."
41
+ )
42
+ if isinstance(metrics, (list, tuple, set, frozenset)):
43
+ result = list(metrics)
44
+ for m in result:
45
+ if not isinstance(m, str):
46
+ raise ValueError(f"metrics entries must be strings; got {type(m).__name__}.")
47
+ # Drop unknown metric names with a warning — better than crashing mid-fetch.
48
+ unknown = [m for m in result if m not in _VALID_METRICS]
49
+ if unknown:
50
+ logger.warning(f"Unknown metrics ignored: {unknown}")
51
+ result = [m for m in result if m in _VALID_METRICS]
52
+ return result
53
+ raise ValueError(f"metrics must be a list/tuple/set; got {type(metrics).__name__}.")
54
+
55
+
56
+ def _write_json_safely(path: str, payload: dict) -> None:
57
+ # Create the parent dir so the user can pass any nested path.
58
+ p = Path(path)
59
+ p.parent.mkdir(parents=True, exist_ok=True)
60
+ with open(p, "w", encoding="utf-8") as f:
61
+ json.dump(payload, f, indent=2, ensure_ascii=False)
62
+
63
+
25
64
  def fetch_traffic(token: str, repo_name=None, print_table: bool = False, return_format: str = "dataframe", save_file: str = None, metrics: list = None):
26
65
  """
27
66
  Fetches the last 14 days of traffic data for one or all repositories.
@@ -43,6 +82,10 @@ def fetch_traffic(token: str, repo_name=None, print_table: bool = False, return_
43
82
  A ``pandas.DataFrame`` when ``return_format="dataframe"``, otherwise
44
83
  a ``dict`` matching the requested format.
45
84
  """
85
+ # Strip surrounding whitespace from the token (matches api._get_token).
86
+ token = (token or "").strip() if isinstance(token, str) else token
87
+ metrics = _coerce_metrics(metrics)
88
+
46
89
  # Hit the GitHub API and get back a tidy DataFrame (one row per day per repo)
47
90
  df = fetch_traffic_data(token, repo_name, metrics)
48
91
 
@@ -54,13 +97,14 @@ def fetch_traffic(token: str, repo_name=None, print_table: bool = False, return_
54
97
  if return_format == "dataframe":
55
98
  if save_file:
56
99
  if save_file.endswith(".json"):
57
- # Save as a chart-ready JSON file
100
+ # Save as a chart-ready JSON file (always public-only when exported to disk)
58
101
  payload = build_json_payload(df, return_format="timeseries", export_public_only=True)
59
- with open(save_file, "w", encoding="utf-8") as f:
60
- json.dump(payload, f, indent=2)
102
+ _write_json_safely(save_file, payload)
61
103
  else:
62
104
  # Save as a standard CSV file
63
- df.to_csv(save_file, index=False)
105
+ p = Path(save_file)
106
+ p.parent.mkdir(parents=True, exist_ok=True)
107
+ df.to_csv(p, index=False)
64
108
  return df
65
109
 
66
110
  # Reject anything that isn't a known format before doing more work
@@ -71,13 +115,14 @@ def fetch_traffic(token: str, repo_name=None, print_table: bool = False, return_
71
115
  f"Choose one of: 'dataframe', 'timeseries', 'summary'."
72
116
  )
73
117
 
74
- # Build the JSON-serialisable payload in the requested shape
75
- payload = build_json_payload(df, return_format=return_format, export_public_only=False)
118
+ # Build the JSON-serialisable payload in the requested shape.
119
+ # When persisting to disk, strip private repos by default (security firewall).
120
+ export_public_only = bool(save_file)
121
+ payload = build_json_payload(df, return_format=return_format, export_public_only=export_public_only)
76
122
 
77
123
  # Save to disk if the user gave us a file path
78
124
  if save_file:
79
- with open(save_file, "w", encoding="utf-8") as f:
80
- json.dump(payload, f, indent=2)
125
+ _write_json_safely(save_file, payload)
81
126
 
82
127
  return payload
83
128
 
@@ -98,10 +143,12 @@ def sync(token: str, repo_name=None, data_dir: str = "./data", output_mode: str
98
143
  exported JSON — acts as a security firewall.
99
144
  metrics: Optional list of metrics to fetch (e.g., ``["views", "clones"]``).
100
145
  """
101
- if data_dir and not os.path.isabs(data_dir) and not os.path.exists(data_dir):
102
- parent_dir = os.path.join("..", data_dir)
103
- if os.path.exists(parent_dir):
104
- data_dir = parent_dir
146
+ # Strip the token. Resolve the data_dir to an absolute path but NEVER redirect
147
+ # to a sibling directory — respect the user's CWD.
148
+ token = (token or "").strip() if isinstance(token, str) else token
149
+ metrics = _coerce_metrics(metrics)
150
+ if data_dir:
151
+ data_dir = str(Path(data_dir).expanduser().resolve())
105
152
 
106
153
  # Hand off to the automation engine — it handles deduplication and schema migration
107
154
  run_sync(
@@ -145,17 +192,13 @@ def serve_dashboard(host: str = "127.0.0.1", port: int = 8000, token: str = None
145
192
  _orig_data_dir = os.environ.get("GITLYTICS_DATA_DIR")
146
193
  try:
147
194
  if token:
148
- os.environ["GITLYTICS_TOKEN"] = token
195
+ os.environ["GITLYTICS_TOKEN"] = (token or "").strip()
149
196
  if data_dir:
150
- from pathlib import Path
151
- abs_data_dir = os.path.abspath(data_dir)
152
- if not os.path.exists(abs_data_dir) and not os.path.isabs(data_dir):
153
- parent_dir = os.path.abspath(os.path.join("..", data_dir))
154
- if os.path.exists(parent_dir):
155
- abs_data_dir = parent_dir
156
- if not os.path.exists(abs_data_dir):
197
+ abs_data_dir = str(Path(data_dir).expanduser().resolve())
198
+ p = Path(abs_data_dir)
199
+ if not p.exists():
157
200
  print(f"⚠️ Warning: The specified data directory '{data_dir}' (resolved to '{abs_data_dir}') does not exist.")
158
- elif not any(Path(abs_data_dir).glob("traffic_*.csv")):
201
+ elif not any(p.glob("traffic_*.csv")):
159
202
  print(f"⚠️ Warning: No traffic_*.csv database files found in '{data_dir}' (resolved to '{abs_data_dir}').")
160
203
  os.environ["GITLYTICS_DATA_DIR"] = abs_data_dir
161
204
  uvicorn.run("gitlytics.api:app", host=host, port=port, reload=False)
@@ -5,7 +5,9 @@ Powers the FastAPI backend — serves traffic data and the React dashboard to th
5
5
  import hashlib
6
6
  import logging
7
7
  import os
8
+ import shutil
8
9
  import time as _time
10
+ import uuid
9
11
  from pathlib import Path
10
12
 
11
13
  import pandas as pd
@@ -28,15 +30,14 @@ logger = logging.getLogger(__name__)
28
30
 
29
31
  app = FastAPI(title="GitHub Traffic API")
30
32
 
31
- # Only allow requests from localhost — never deployed publicly
33
+ # Only allow requests from localhost — never deployed publicly.
34
+ # Vite's dev port (5173) is intentionally excluded from the production allowlist.
32
35
  _ALLOWED_ORIGINS = [
33
36
  "http://localhost",
34
37
  "http://localhost:3000",
35
- "http://localhost:5173",
36
38
  "http://localhost:8000",
37
39
  "http://127.0.0.1",
38
40
  "http://127.0.0.1:3000",
39
- "http://127.0.0.1:5173",
40
41
  "http://127.0.0.1:8000",
41
42
  ]
42
43
 
@@ -45,13 +46,18 @@ app.add_middleware(
45
46
  allow_origins=_ALLOWED_ORIGINS,
46
47
  allow_credentials=True,
47
48
  allow_methods=["GET", "POST"],
48
- allow_headers=["Content-Type"],
49
+ # `Authorization` is required for cross-origin deployments that pass the
50
+ # GitHub PAT in a header. Same-origin requests are unaffected.
51
+ allow_headers=["Content-Type", "Authorization"],
49
52
  )
50
53
 
51
54
 
52
55
  _auth_cache: dict = {} # sha256_prefix -> (valid, username, expires_at)
53
56
  _AUTH_CACHE_TTL = 300 # 5 minutes
54
57
 
58
+ # Hard cap on CSV upload size — prevents DoS via /api/upload-csv.
59
+ _MAX_UPLOAD_BYTES = 25 * 1024 * 1024
60
+
55
61
 
56
62
  def _get_token(token: str = None) -> str:
57
63
  # C-2: explicit empty string must not fall through to the env token
@@ -62,14 +68,14 @@ def _get_token(token: str = None) -> str:
62
68
 
63
69
  def _validate_token_cached(token: str):
64
70
  # M-1: cache validation results to avoid a double HTTP round-trip on every /api/traffic call
71
+ from gitlytics.core import validate_token as _validate_token
65
72
  key = hashlib.sha256(token.encode()).hexdigest()[:16]
66
73
  now = _time.time()
67
74
  if key in _auth_cache:
68
75
  valid, username, expires = _auth_cache[key]
69
76
  if now < expires:
70
77
  return valid, username
71
- from gitlytics.core import validate_token
72
- valid, username = validate_token(token)
78
+ valid, username = _validate_token(token)
73
79
  _auth_cache[key] = (valid, username, now + _AUTH_CACHE_TTL)
74
80
  return valid, username
75
81
 
@@ -115,10 +121,22 @@ def get_username_data(username: str = Body("", embed=True)):
115
121
  raise HTTPException(status_code=400, detail="Username is required.")
116
122
  try:
117
123
  profile = get_public_user(username.strip())
124
+ # Distinguish "user found" (login matches the request) from "GitHub failed
125
+ # and we returned a stub". Returning 200 for the latter would silently mask
126
+ # upstream outages.
127
+ if profile.get("login") == username.strip() and profile.get("html_url"):
128
+ pass
129
+ elif profile.get("login") == username.strip():
130
+ # Found but GitHub didn't include html_url — still treat as success.
131
+ pass
132
+ else:
133
+ raise HTTPException(status_code=502, detail="GitHub did not return a profile.")
118
134
  repos = get_public_repos(username.strip())
119
135
  return {"profile": profile, "repos": repos}
120
136
  except ValueError as exc:
121
137
  raise HTTPException(status_code=404, detail=str(exc))
138
+ except HTTPException:
139
+ raise
122
140
  except Exception as exc:
123
141
  logger.warning(f"Username fetch failed for {username}: {exc}")
124
142
  raise HTTPException(status_code=500, detail="Failed to fetch GitHub data.")
@@ -170,9 +188,14 @@ def get_traffic(token: str = Body("", embed=True)):
170
188
 
171
189
  df = df.replace([float('inf'), float('-inf')], None).where(pd.notnull(df), None)
172
190
 
173
- # Build a quick view-sum map to find the top 20 repos for deep fetching
191
+ # Build a quick view-sum map to find the top 20 repos for deep fetching.
192
+ # Guard against the subset-metrics case where `views` may be absent.
174
193
  repos_with_views = []
175
- if not df.empty and "repository" in df.columns and "views" in df.columns:
194
+ if (
195
+ not df.empty
196
+ and "repository" in df.columns
197
+ and "views" in df.columns
198
+ ):
176
199
  for repo_name, group in df.groupby("repository"):
177
200
  repos_with_views.append({"repository": repo_name, "total_views": int(group["views"].sum())})
178
201
 
@@ -192,16 +215,34 @@ def upload_csv(file: UploadFile = File(...)):
192
215
  if data_dir:
193
216
  data_dir_path = Path(data_dir)
194
217
  data_dir_path.mkdir(parents=True, exist_ok=True)
218
+ # Stream the upload to disk so we don't buffer the whole file in
219
+ # memory. Enforce the size cap on the way in.
220
+ dest = data_dir_path / f"traffic_uploaded_{uuid.uuid4().hex}.csv"
221
+ total = 0
222
+ with open(dest, "wb") as out:
223
+ while True:
224
+ chunk = file.file.read(1024 * 1024)
225
+ if not chunk:
226
+ break
227
+ total += len(chunk)
228
+ if total > _MAX_UPLOAD_BYTES:
229
+ out.close()
230
+ try:
231
+ dest.unlink()
232
+ except OSError:
233
+ pass
234
+ raise HTTPException(
235
+ status_code=413,
236
+ detail=f"CSV too large. Maximum size is {_MAX_UPLOAD_BYTES // (1024 * 1024)} MB.",
237
+ )
238
+ out.write(chunk)
195
239
  file.file.seek(0)
196
- content = file.file.read()
197
- file.file.seek(0)
198
- dest = data_dir_path / f"traffic_uploaded_{int(_time.time())}.csv"
199
- with open(dest, "wb") as f:
200
- f.write(content)
201
240
  df = process_uploaded_csv(file.file)
202
241
  df = df.replace([float('inf'), float('-inf')], None).where(pd.notnull(df), None)
203
242
  payload = build_react_payload(df, deep_stats=None)
204
243
  return payload
244
+ except HTTPException:
245
+ raise
205
246
  except Exception as e:
206
247
  raise HTTPException(status_code=400, detail=str(e))
207
248
 
@@ -224,6 +265,10 @@ def serve_index():
224
265
  @app.get("/{full_path:path}")
225
266
  def serve_spa_fallback(full_path: str):
226
267
  """SPA catch-all — returns index.html so React Router handles navigation."""
268
+ # Any unhandled /api/* path is a real 404, not a SPA route.
269
+ if full_path.startswith("api/"):
270
+ return JSONResponse(status_code=404, content={"error": "Not found."})
271
+
227
272
  asset_file = frontend_dir / full_path
228
273
  if asset_file.exists() and asset_file.is_file():
229
274
  return FileResponse(asset_file)
@@ -74,7 +74,14 @@ def export_json_database(data_dir: str, export_path: str, export_public_only: bo
74
74
  export_file.parent.mkdir(parents=True, exist_ok=True)
75
75
 
76
76
  with open(export_file, "w", encoding="utf-8") as f:
77
- json.dump(payload, f, indent=2)
77
+ json.dump(payload, f, indent=2, ensure_ascii=False)
78
+
79
+
80
+ def _normalize_field(name: str) -> str:
81
+ # Strip BOM, lower-case, and trim so older CSVs match the new schema.
82
+ if not isinstance(name, str):
83
+ return ""
84
+ return name.lstrip("").strip().lower()
78
85
 
79
86
 
80
87
  def _merge_schema(existing_fields: list, new_fields: list) -> list:
@@ -83,18 +90,30 @@ def _merge_schema(existing_fields: list, new_fields: list) -> list:
83
90
  Existing fields keep their original order; new fields are appended at the end
84
91
  so historical rows stay compatible with the updated schema.
85
92
  """
86
- merged = list(existing_fields)
93
+ # Normalize on the fly so 'Date' and 'date' are treated identically.
94
+ norm_existing = [_normalize_field(f) for f in existing_fields]
95
+ merged_norm = list(norm_existing)
96
+ original_for_norm = list(existing_fields)
87
97
  for col in new_fields:
88
- if col not in merged:
98
+ n = _normalize_field(col)
99
+ if n and n not in merged_norm:
89
100
  # A new column appeared in the API response — add it so it gets saved
90
101
  logger.info(f"Schema upgrade: adding new column '{col}' to existing CSV.")
91
- merged.append(col)
92
- return merged
102
+ merged_norm.append(n)
103
+ original_for_norm.append(col)
104
+ # Return the original-cased names where possible so the CSV header stays human-readable.
105
+ return original_for_norm
93
106
 
94
107
 
95
108
  def run_sync_cycle(token: str, repo_names=None, data_dir="./data", output_mode="monthly", export_json=None, export_public_only=True, metrics: list = None):
96
109
  # Fetch fresh traffic data from GitHub
97
110
  df = fetch_traffic_data(token, repo_names, metrics)
111
+
112
+ # Always regenerate the export if requested, even when fresh data is empty,
113
+ # so the export file never becomes silently stale.
114
+ if export_json:
115
+ export_json_database(data_dir, export_json, export_public_only)
116
+
98
117
  if df.empty:
99
118
  logger.info("No traffic data found to sync.")
100
119
  return
@@ -108,18 +127,19 @@ def run_sync_cycle(token: str, repo_names=None, data_dir="./data", output_mode="
108
127
 
109
128
  if file_exists:
110
129
  # Read the existing column headers so we can migrate the schema if needed
111
- with open(csv_path, "r", encoding="utf-8") as f:
130
+ with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
112
131
  reader = csv.reader(f)
113
132
  try:
114
133
  existing_fields = next(reader)
115
134
  except StopIteration:
116
135
  pass
117
136
 
118
- # Load all existing rows into a dict keyed by (repo, date) for deduplication
137
+ # Load all existing rows in one pass — avoid iterrows() on large files.
119
138
  try:
120
139
  existing_df = pd.read_csv(csv_path)
121
- for _, row in existing_df.iterrows():
122
- existing_data[(str(row["repository"]), str(row["date"]))] = row.to_dict()
140
+ for record in existing_df.to_dict("records"):
141
+ key = (str(record.get("repository", "")), str(record.get("date", "")))
142
+ existing_data[key] = record
123
143
  except Exception as exc:
124
144
  logger.warning(f"Could not read existing CSV '{csv_path}': {exc}. Starting fresh.")
125
145
 
@@ -132,24 +152,32 @@ def run_sync_cycle(token: str, repo_names=None, data_dir="./data", output_mode="
132
152
 
133
153
  # Merge fresh data into existing rows — preserves columns not present in this sync run
134
154
  new_records_added = 0
135
- for _, row in df.iterrows():
136
- key = (str(row["repository"]), str(row["date"]))
155
+ for record in df.to_dict("records"):
156
+ key = (str(record.get("repository", "")), str(record.get("date", "")))
137
157
  if key not in existing_data:
138
158
  new_records_added += 1
139
- existing_data[key] = row.to_dict()
159
+ existing_data[key] = record
140
160
  else:
141
- existing_data[key].update(row.to_dict())
161
+ existing_data[key].update(record)
142
162
 
143
163
  # Sort all rows by date and repo name before writing back to disk
144
164
  final_rows = []
145
165
  for v in existing_data.values():
146
166
  # Fill any missing schema columns with empty strings for old rows
147
- clean_row = {k: v.get(k, "") for k in existing_fields}
167
+ clean_row = {}
168
+ for k in existing_fields:
169
+ nk = _normalize_field(k)
170
+ if nk in v:
171
+ clean_row[k] = v[nk]
172
+ else:
173
+ # Try to match by normalized name to old-style keys in the dict.
174
+ match = next((orig for orig in v.keys() if _normalize_field(orig) == nk), None)
175
+ clean_row[k] = v[match] if match is not None else ""
148
176
  final_rows.append(clean_row)
149
177
 
150
- final_rows.sort(key=lambda x: (x.get("date", ""), x.get("repository", "")))
178
+ final_rows.sort(key=lambda x: (str(x.get("date", "")), str(x.get("repository", ""))))
151
179
 
152
- # Write everything back to the CSV file atomically
180
+ # Write everything back to the CSV file
153
181
  with open(csv_path, "w", newline="", encoding="utf-8") as f:
154
182
  writer = csv.DictWriter(f, fieldnames=existing_fields, extrasaction='ignore')
155
183
  writer.writeheader()
@@ -157,10 +185,15 @@ def run_sync_cycle(token: str, repo_names=None, data_dir="./data", output_mode="
157
185
 
158
186
  logger.info(f"Successfully processed traffic data. Added {new_records_added} new daily records to {csv_path}")
159
187
 
160
- # If the user wants a JSON export, regenerate it from the full database
161
- if export_json:
162
- export_json_database(data_dir, export_json, export_public_only)
163
- logger.info(f"Exported historical database to {export_json}")
188
+
189
+ def _interruptible_sleep(total_seconds: float) -> None:
190
+ # Sleep in short chunks so SIGINT/SIGTERM (Ctrl-C, docker stop) is responsive.
191
+ chunk = 1.0
192
+ elapsed = 0.0
193
+ while elapsed < total_seconds:
194
+ remaining = total_seconds - elapsed
195
+ time.sleep(min(chunk, remaining))
196
+ elapsed += chunk
164
197
 
165
198
 
166
199
  def run_sync(token: str, repo_names=None, data_dir="./data", output_mode="monthly", schedule_cron=None, export_json=None, export_public_only=True, metrics: list = None):
@@ -173,7 +206,7 @@ def run_sync(token: str, repo_names=None, data_dir="./data", output_mode="monthl
173
206
  # Parse the cron expression — fail early if the format is wrong
174
207
  try:
175
208
  croniter(schedule_cron, datetime.now(timezone.utc)) # validate only
176
- except ValueError as e:
209
+ except (ValueError, KeyError, TypeError) as e:
177
210
  logger.error(f"Invalid cron expression: {e}")
178
211
  return
179
212
 
@@ -190,21 +223,25 @@ def run_sync(token: str, repo_names=None, data_dir="./data", output_mode="monthl
190
223
 
191
224
  sleep_secs = (next_run - now_utc).total_seconds()
192
225
 
193
- # Sleep until the next scheduled run
194
- if sleep_secs > 0:
195
- logger.info(f"Scheduled next sync for {next_run.strftime('%Y-%m-%d %H:%M:%S UTC')}. Sleeping...")
196
- time.sleep(sleep_secs)
226
+ # Guard against negative/zero sleep (clock skew): minimum 1s to avoid hot-looping.
227
+ if sleep_secs <= 0:
228
+ sleep_secs = 1.0
229
+
230
+ logger.info(f"Scheduled next sync for {next_run.strftime('%Y-%m-%d %H:%M:%S UTC')}. Sleeping {sleep_secs:.0f}s...")
231
+ _interruptible_sleep(sleep_secs)
197
232
 
198
233
  try:
199
234
  # Always re-validate the token before fetching to catch expiry early
200
235
  is_valid, msg = validate_token(token)
201
236
  if not is_valid:
202
237
  # Distinguish between a dead token (stop forever) and a network blip (retry next cycle)
238
+ msg_lower = msg.lower()
203
239
  is_auth_failure = (
204
240
  "401" in msg
205
- or "authentication failed" in msg.lower()
206
- or "invalid token" in msg.lower()
207
- or "revoked" in msg.lower()
241
+ or "authentication failed" in msg_lower
242
+ or "invalid token" in msg_lower
243
+ or "revoked" in msg_lower
244
+ or "bad credentials" in msg_lower
208
245
  )
209
246
  if is_auth_failure:
210
247
  logger.critical(
@@ -217,6 +254,11 @@ def run_sync(token: str, repo_names=None, data_dir="./data", output_mode="monthl
217
254
  continue
218
255
 
219
256
  run_sync_cycle(token, repo_names, data_dir, output_mode, export_json, export_public_only, metrics)
257
+ except SystemExit:
258
+ raise
259
+ except KeyboardInterrupt:
260
+ logger.info("Daemon interrupted by user. Exiting.")
261
+ raise
220
262
  except Exception as e:
221
263
  # Don't let a single bad cycle kill the daemon — just log and carry on
222
264
  logger.error(f"Daemon encountered unexpected error: {e}. Recovering for next cycle.")
@@ -6,7 +6,6 @@ Achieves 100% feature parity with the Python API.
6
6
  import argparse
7
7
  import sys
8
8
  import os
9
- import json
10
9
 
11
10
  # Import the three public functions that power every subcommand
12
11
  from gitlytics import fetch_traffic, sync, serve_dashboard
@@ -16,7 +15,14 @@ def parse_repo_names(repo_arg: str):
16
15
  # Turn "owner/repo1, owner/repo2" into ["owner/repo1", "owner/repo2"], or None if empty
17
16
  if not repo_arg:
18
17
  return None
19
- return [r.strip() for r in repo_arg.split(",")]
18
+ names = [r.strip() for r in repo_arg.split(",")]
19
+ # Validate the format up front so the user gets a clear error instead of a 404.
20
+ for name in names:
21
+ if "/" not in name:
22
+ raise argparse.ArgumentTypeError(
23
+ f"Invalid repo name {name!r}: expected 'owner/repo' format."
24
+ )
25
+ return names
20
26
 
21
27
 
22
28
  def main():
@@ -72,13 +78,14 @@ def main():
72
78
 
73
79
  args = parser.parse_args()
74
80
 
75
- # Print help and exit if the user didn't give a subcommand
81
+ # Print help and exit cleanly (code 0) if the user didn't give a subcommand.
76
82
  if not args.command:
77
83
  parser.print_help()
78
- sys.exit(1)
84
+ sys.exit(0)
79
85
 
80
86
  # Resolve token: CLI flag wins, then environment variables
81
- token = getattr(args, "token", None) or os.environ.get("GITLYTICS_TOKEN") or os.environ.get("GITHUB_TOKEN")
87
+ raw_token = getattr(args, "token", None)
88
+ token = (raw_token or os.environ.get("GITLYTICS_TOKEN") or os.environ.get("GITHUB_TOKEN") or "").strip() or None
82
89
 
83
90
  # fetch and sync both need a token — bail early with a clear message
84
91
  if args.command in ["fetch", "sync"] and not token:
@@ -86,31 +93,56 @@ def main():
86
93
  sys.exit(1)
87
94
 
88
95
  if args.command == "fetch":
89
- repos = parse_repo_names(args.repo_name)
90
- result = fetch_traffic(
91
- token=token,
92
- repo_name=repos,
93
- print_table=args.print_table,
94
- return_format=args.return_format,
95
- save_file=args.save_file,
96
- metrics=args.metrics
97
- )
96
+ try:
97
+ repos = parse_repo_names(args.repo_name)
98
+ except argparse.ArgumentTypeError as exc:
99
+ print(f"❌ Error: {exc}")
100
+ sys.exit(2)
101
+ try:
102
+ result = fetch_traffic(
103
+ token=token,
104
+ repo_name=repos,
105
+ print_table=args.print_table,
106
+ return_format=args.return_format,
107
+ save_file=args.save_file,
108
+ metrics=args.metrics
109
+ )
110
+ except ValueError as exc:
111
+ print(f"❌ Error: {exc}")
112
+ sys.exit(2)
98
113
  # Give the user a hint if they didn't ask for any output
99
114
  if not args.print_table and not args.save_file:
100
- print("Fetch successful. Use --print-table or --save-file to see results.")
115
+ try:
116
+ import pandas as pd
117
+ if isinstance(result, pd.DataFrame):
118
+ print(f"Fetch successful. {len(result)} row(s) across {result['repository'].nunique() if not result.empty else 0} repo(s).")
119
+ print("First 5 rows:")
120
+ print(result.head().to_string(index=False))
121
+ else:
122
+ print(f"Fetch successful. Use --print-table or --save-file to see results.")
123
+ except Exception:
124
+ print("Fetch successful. Use --print-table or --save-file to see results.")
101
125
 
102
126
  elif args.command == "sync":
103
- repos = parse_repo_names(args.repo_name)
104
- sync(
105
- token=token,
106
- repo_name=repos,
107
- data_dir=args.data_dir,
108
- output_mode=args.output_mode,
109
- schedule_cron=args.schedule_cron,
110
- export_json=args.export_json,
111
- export_public_only=args.export_public_only,
112
- metrics=args.metrics
113
- )
127
+ try:
128
+ repos = parse_repo_names(args.repo_name)
129
+ except argparse.ArgumentTypeError as exc:
130
+ print(f"❌ Error: {exc}")
131
+ sys.exit(2)
132
+ try:
133
+ sync(
134
+ token=token,
135
+ repo_name=repos,
136
+ data_dir=args.data_dir,
137
+ output_mode=args.output_mode,
138
+ schedule_cron=args.schedule_cron,
139
+ export_json=args.export_json,
140
+ export_public_only=args.export_public_only,
141
+ metrics=args.metrics
142
+ )
143
+ except ValueError as exc:
144
+ print(f"❌ Error: {exc}")
145
+ sys.exit(2)
114
146
 
115
147
  elif args.command == "dashboard":
116
148
  print(f"\n[Gitlytics] Starting Gitlytics Dashboard on http://{args.host}:{args.port}\n")