gharc 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gharc/__init__.py +19 -0
- gharc/cli.py +48 -0
- gharc/filters.py +29 -0
- gharc/storage.py +136 -0
- gharc/streamer.py +249 -0
- gharc/utils.py +53 -0
- gharc-0.1.0.dist-info/METADATA +261 -0
- gharc-0.1.0.dist-info/RECORD +12 -0
- gharc-0.1.0.dist-info/WHEEL +5 -0
- gharc-0.1.0.dist-info/entry_points.txt +2 -0
- gharc-0.1.0.dist-info/licenses/LICENSE +21 -0
- gharc-0.1.0.dist-info/top_level.txt +1 -0
gharc/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
|
2
|
+
|
|
3
|
+
from .filters import passes_filters, fast_string_check
|
|
4
|
+
from .storage import DataWriter, jsonl_to_parquet
|
|
5
|
+
from .streamer import process_range
|
|
6
|
+
from .utils import parse_date, date_range, get_url_for_time, setup_logging
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"__version__",
|
|
10
|
+
"passes_filters",
|
|
11
|
+
"fast_string_check",
|
|
12
|
+
"DataWriter",
|
|
13
|
+
"jsonl_to_parquet",
|
|
14
|
+
"process_range",
|
|
15
|
+
"parse_date",
|
|
16
|
+
"date_range",
|
|
17
|
+
"get_url_for_time",
|
|
18
|
+
"setup_logging",
|
|
19
|
+
]
|
gharc/cli.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# src/gharc/cli.py
|
|
2
|
+
import click
|
|
3
|
+
import sys
|
|
4
|
+
from .utils import parse_date, logger, setup_logging
|
|
5
|
+
from .streamer import process_range
|
|
6
|
+
from .storage import jsonl_to_parquet
|
|
7
|
+
|
|
8
|
+
# Root command group for the `gharc` CLI. The `download` and `convert`
# subcommands below register themselves on it via `@main.command()`.
@click.group()
def main():
    """gharc: Stream-filter GitHub Archive data."""
    # Attach the console log handler; setup_logging() returns early when a
    # handler is already attached, so repeated invocations are harmless.
    setup_logging()
|
|
12
|
+
|
|
13
|
+
@main.command()
@click.option('--start', required=True, help='Start date, inclusive (YYYY-MM-DD or YYYY-MM-DD-HH)')
@click.option('--end', required=True, help='End date, exclusive (YYYY-MM-DD or YYYY-MM-DD-HH)')
@click.option('--repos', help='Comma-separated repos (e.g. apache/spark)')
@click.option('--event-types', help='Comma-separated events (e.g. PushEvent)')
@click.option('--output', default='filtered.jsonl', help='Output file')
@click.option('--workers', default=4, help='Parallel downloads')
def download(start, end, repos, event_types, output, workers):
    """Download GHArchive hours in [START, END) and keep only matching events."""

    def split_csv(raw, option_name):
        # "a, b,,c" -> ["a", "b", "c"]; None when the option was not given.
        # Blank entries are dropped because an empty token is a substring of
        # every line, which would silently disable the fast pre-filter.
        if not raw:
            return None
        items = [part.strip() for part in raw.split(',') if part.strip()]
        if not items:
            raise ValueError(f"{option_name} was given but contains no values")
        return items

    try:
        s_dt = parse_date(start)
        e_dt = parse_date(end)
        repo_list = split_csv(repos, '--repos')
        type_list = split_csv(event_types, '--event-types')

        process_range(s_dt, e_dt, repo_list, type_list, output, workers)

    except Exception as e:
        # CLI boundary: report the failure through the logger and exit
        # non-zero instead of showing a traceback.
        logger.error(str(e))
        sys.exit(1)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# `gharc convert INPUT_PATH OUTPUT_PATH`: click requires INPUT_PATH to exist
# and rejects directories for both arguments before the function body runs.
@main.command()
@click.argument('input_path', type=click.Path(exists=True, dir_okay=False))
@click.argument('output_path', type=click.Path(dir_okay=False))
@click.option('--batch-size', default=10000, help='Rows per Parquet row group')
def convert(input_path, output_path, batch_size):
    """Convert a JSONL output from `gharc download` into a single Parquet file."""
    try:
        jsonl_to_parquet(input_path, output_path, batch_size=batch_size)
    except Exception as e:
        # CLI boundary: log the error message and exit with status 1 rather
        # than letting a traceback reach the user.
        logger.error(str(e))
        sys.exit(1)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
if __name__ == "__main__":
|
|
48
|
+
main()
|
gharc/filters.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# src/gharc/filters.py
|
|
2
|
+
import json
|
|
3
|
+
|
|
4
|
+
def passes_filters(event_dict: dict, repos: list = None, event_types: list = None) -> bool:
    """Return True only if the event satisfies EVERY filter that was supplied.

    The filters are combined with AND: when both ``repos`` and
    ``event_types`` are given, the event must match both. A filter that is
    ``None`` or empty is ignored (pass-through).

    Args:
        event_dict: Parsed event; the repo name is read from
            ``event_dict['repo']['name']`` and the type from
            ``event_dict['type']``.
        repos: Repository names to keep, or None/empty for no repo filter.
        event_types: Event type names to keep, or None/empty for no type filter.

    Returns:
        True if the event passes all active filters, otherwise False.
    """
    # 1. Filter by repo name. A missing or null "repo" field simply fails the
    #    filter instead of raising AttributeError on None.
    if repos:
        repo_info = event_dict.get('repo')
        repo_name = repo_info.get('name') if isinstance(repo_info, dict) else None
        if repo_name not in repos:
            return False

    # 2. Filter by event type.
    if event_types and event_dict.get('type') not in event_types:
        return False

    return True
|
|
21
|
+
|
|
22
|
+
def fast_string_check(line: str, tokens: list) -> bool:
    """Cheap pre-filter: report whether any token occurs in ``line``.

    Used to skip JSON parsing for lines that cannot match. With no tokens
    (None or empty) there is nothing to rule out, so the line is accepted.
    """
    if tokens:
        for token in tokens:
            if token in line:
                return True
        return False
    return True
|
gharc/storage.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# src/gharc/storage.py
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import pyarrow as pa
|
|
6
|
+
import pyarrow.parquet as pq
|
|
7
|
+
from .utils import logger
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DataWriter:
    """Buffered writer for event records.

    Writes Parquet when ``filename`` ends with ``.parquet`` and JSON Lines
    otherwise. Records accumulate in ``self.buffer`` and are written out
    whenever the buffer reaches ``self.buffer_size`` or ``flush()`` /
    ``close()`` is called.
    """

    def __init__(self, filename: str, append: bool = False):
        """Prepare the output file.

        Args:
            filename: Output path; the extension selects the format.
            append: Keep an existing file and add to it (JSONL only). When
                False, an existing file is deleted first.

        Raises:
            ValueError: If ``append`` is requested for a Parquet file that
                already exists.
        """
        self.filename = filename
        self.is_parquet = filename.endswith('.parquet')
        self.buffer = []
        self.buffer_size = 10000
        self._pq_writer = None

        if append and self.is_parquet and os.path.exists(self.filename):
            # ParquetWriter cannot append to a closed Parquet file. For long
            # crash-safe runs use JSONL; convert to Parquet at the end.
            raise ValueError(
                f"Cannot resume into existing Parquet file {filename}. "
                f"Use JSONL output for resumable runs and convert at the end."
            )

        if not append and os.path.exists(self.filename):
            os.remove(self.filename)

    def write(self, record: dict):
        """Queue one record, flushing automatically once the buffer is full."""
        self.buffer.append(record)
        if len(self.buffer) >= self.buffer_size:
            self.flush()

    def flush(self):
        """Write every buffered record to disk and empty the buffer."""
        if not self.buffer:
            return

        if self.is_parquet:
            self._write_parquet_batch()
        else:
            with open(self.filename, 'a', encoding='utf-8') as f:
                for rec in self.buffer:
                    f.write(json.dumps(rec) + '\n')

        self.buffer = []

    def _write_parquet_batch(self):
        """Write the current buffer as one Parquet row group."""
        df = pd.DataFrame([_flatten_event(e) for e in self.buffer])

        if self._pq_writer is not None:
            # The first batch fixed the file schema. Later batches may lack a
            # column or carry a new one (event keys vary), and Table.cast
            # rejects any difference in field names, so align columns first:
            # missing ones become nulls, unknown ones are dropped.
            schema_names = self._pq_writer.schema.names
            dropped = [c for c in df.columns if c not in schema_names]
            if dropped:
                logger.warning(
                    f"Dropping columns not present in the Parquet schema of "
                    f"{self.filename}: {', '.join(map(str, dropped))}"
                )
            df = df.reindex(columns=schema_names)

        table = pa.Table.from_pandas(df, preserve_index=False)

        if self._pq_writer is None:
            self._pq_writer = pq.ParquetWriter(
                self.filename,
                schema=table.schema,
                compression='snappy',
            )
        else:
            # Cast to the schema we opened with; event payloads vary in shape.
            table = table.cast(self._pq_writer.schema, safe=False)

        self._pq_writer.write_table(table)

    def close(self):
        """Flush remaining records and finalize the Parquet file, if any."""
        self.flush()
        if self._pq_writer is not None:
            self._pq_writer.close()
            self._pq_writer = None
        logger.info(f"Wrote output to {self.filename}")
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _flatten_event(event: dict) -> dict:
    """Return a copy of ``event`` whose dict/list values are JSON strings.

    Nested values are serialized (non-ASCII characters kept as-is) so that
    Parquet sees a stable flat schema; scalar values pass through unchanged.
    """
    return {
        name: json.dumps(item, ensure_ascii=False) if isinstance(item, (dict, list)) else item
        for name, item in event.items()
    }
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def jsonl_to_parquet(input_path: str, output_path: str, batch_size: int = 10000) -> int:
    """Stream a JSONL file into a single Parquet file.

    Reads `input_path` line by line, batches into Parquet row groups of up to
    `batch_size` rows, and writes to `output_path`. Designed to handle
    multi-GB inputs without loading the whole file into memory.

    Blank lines, malformed JSON lines, and lines that are valid JSON but not
    objects are skipped (the latter two with a warning). The schema is taken
    from the first batch; later batches are aligned to it, so a column absent
    from a batch is written as nulls and a column unknown to the schema is
    dropped with a warning.

    Args:
        input_path: JSONL file to read (UTF-8).
        output_path: Parquet file to create; an existing file is replaced.
        batch_size: Maximum number of rows per row group.

    Returns:
        The number of rows written.
    """
    if os.path.exists(output_path):
        os.remove(output_path)

    writer = None
    buffer = []
    rows_written = 0

    def flush():
        # Write the buffered rows as one row group, then empty the buffer.
        nonlocal writer, rows_written
        if not buffer:
            return
        df = pd.DataFrame([_flatten_event(e) for e in buffer])
        if writer is not None:
            # Table.cast rejects any difference in field names, so align the
            # batch to the schema chosen by the first batch before converting.
            schema_names = writer.schema.names
            dropped = [c for c in df.columns if c not in schema_names]
            if dropped:
                logger.warning(
                    f"Dropping columns not present in the Parquet schema of "
                    f"{output_path}: {', '.join(map(str, dropped))}"
                )
            df = df.reindex(columns=schema_names)
        table = pa.Table.from_pandas(df, preserve_index=False)
        if writer is None:
            writer = pq.ParquetWriter(
                output_path,
                schema=table.schema,
                compression='snappy',
            )
        else:
            table = table.cast(writer.schema, safe=False)
        writer.write_table(table)
        rows_written += len(buffer)
        buffer.clear()

    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    logger.warning(f"Skipping malformed JSON line in {input_path}")
                    continue
                if not isinstance(record, dict):
                    # Only JSON objects can be flattened into a row.
                    logger.warning(f"Skipping non-object JSON line in {input_path}")
                    continue
                buffer.append(record)
                if len(buffer) >= batch_size:
                    flush()

        flush()
    finally:
        # Always finalize the file footer, even if a batch failed.
        if writer is not None:
            writer.close()

    logger.info(f"Converted {rows_written:,} rows from {input_path} to {output_path}")
    return rows_written
|
gharc/streamer.py
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
# src/gharc/streamer.py
|
|
2
|
+
import requests
|
|
3
|
+
import gzip
|
|
4
|
+
import json
|
|
5
|
+
import concurrent.futures
|
|
6
|
+
import threading
|
|
7
|
+
import time
|
|
8
|
+
import os
|
|
9
|
+
import tempfile
|
|
10
|
+
from datetime import datetime
|
|
11
|
+
from requests.adapters import HTTPAdapter
|
|
12
|
+
from urllib3.util.retry import Retry
|
|
13
|
+
from tqdm import tqdm
|
|
14
|
+
from .utils import get_url_for_time, date_range, logger
|
|
15
|
+
from .filters import passes_filters, fast_string_check
|
|
16
|
+
from .storage import DataWriter
|
|
17
|
+
|
|
18
|
+
# Use orjson if available for 3-5x faster parsing
|
|
19
|
+
try:
|
|
20
|
+
import orjson
|
|
21
|
+
HAS_ORJSON = True
|
|
22
|
+
except ImportError:
|
|
23
|
+
HAS_ORJSON = False
|
|
24
|
+
|
|
25
|
+
_thread_local = threading.local()
|
|
26
|
+
|
|
27
|
+
def get_robust_session():
    """Build a requests session whose HTTP(S) adapter retries failed requests.

    Up to 5 retries with a backoff factor of 1 are configured for HEAD, GET
    and OPTIONS requests that answer with status 429, 500, 502, 503 or 504.
    """
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=5,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"]
        )
    )
    session = requests.Session()
    for scheme in ("https://", "http://"):
        session.mount(scheme, retrying_adapter)
    return session
|
|
40
|
+
|
|
41
|
+
def _session_for_thread() -> requests.Session:
    """Return the calling thread's session, creating it on first use.

    Keeping one session per worker thread lets each thread reuse its own
    connection pool across downloads.
    """
    existing = getattr(_thread_local, "session", None)
    if existing is not None:
        return existing
    fresh = get_robust_session()
    _thread_local.session = fresh
    return fresh
|
|
48
|
+
|
|
49
|
+
def download_resumable(url: str, temp_path: str, session: requests.Session) -> bool:
    """
    Download ``url`` into ``temp_path``, resuming a partial file if one exists.

    If ``temp_path`` already holds data, only the remaining bytes are
    requested with an HTTP ``Range`` header and appended. Callers are
    expected to call this repeatedly with the same ``temp_path`` until it
    returns True.

    Args:
        url: Address of the file to fetch.
        temp_path: Local file to write; may already contain a partial download.
        session: Session used to issue the GET request.

    Returns:
        True when the body was written to completion (or the server answered
        416 to the range request), False on any other HTTP status or on any
        exception. Exceptions are logged at debug level and never propagate.
    """
    resume_header = {}
    mode = 'wb'
    if os.path.exists(temp_path):
        current_size = os.path.getsize(temp_path)
        if current_size > 0:
            # Ask only for the bytes we do not have yet and append them.
            resume_header = {'Range': f'bytes={current_size}-'}
            mode = 'ab'
            # Only log resume if it's significant to keep bar clean
            if current_size > 1024 * 1024:
                tqdm.write(f" ↳ Resuming from {current_size/(1024*1024):.1f} MB")

    try:
        with session.get(url, headers=resume_header, stream=True, timeout=(30, 120)) as r:
            if r.status_code == 416:  # Range not satisfiable: treated as "file already complete"
                return True
            if r.status_code not in [200, 206]:
                logger.debug(f"HTTP {r.status_code} for {url}")
                return False

            # A 200 reply to a range request carries the whole body again
            # rather than the remainder, so overwrite the partial file
            # instead of appending to it.
            if r.status_code == 200 and mode == 'ab':
                mode = 'wb'

            with open(temp_path, mode) as f:
                for chunk in r.iter_content(chunk_size=65536):
                    if chunk:
                        f.write(chunk)
        return True
    except Exception as e:
        # Bytes written before the failure stay in temp_path, so the next
        # call can resume from them.
        logger.debug(f"Download attempt failed for {url}: {e}")
        return False
|
|
83
|
+
|
|
84
|
+
def process_single_hour(dt: datetime, repos: list, event_types: list) -> list:
    """
    Download one hourly archive, filter it, and delete the temporary file.

    The archive is fetched into a temporary ``.json.gz`` file (up to 10
    resumable attempts, 2 seconds apart), then read line by line. Each line
    first goes through a cheap substring pre-check against the filter values
    and is only parsed as JSON if it might match; parsed events are then
    checked with ``passes_filters``.

    Args:
        dt: The hour to process.
        repos: Repository names to keep, or None/empty for no repo filter.
        event_types: Event types to keep, or None/empty for no type filter.

    Returns:
        The matching events as parsed dicts. An empty list is returned when
        the download fails after all attempts; if reading the gzip stream
        fails part-way, the events collected up to that point are returned.
        Both cases are reported via ``tqdm.write`` rather than raised.
    """
    url = get_url_for_time(dt)
    results = []
    # Pre-check tokens are bytes on the orjson path (lines are compared
    # without decoding) and str on the fallback path.
    if HAS_ORJSON:
        fast_tokens = [t.encode('utf-8') for t in ((repos if repos else []) + (event_types if event_types else []))]
    else:
        fast_tokens = (repos if repos else []) + (event_types if event_types else [])

    session = _session_for_thread()

    # Only the path is needed; download_resumable reopens the file itself.
    fd, temp_path = tempfile.mkstemp(suffix=".json.gz")
    os.close(fd)

    download_success = False

    try:
        for attempt in range(10):
            if download_resumable(url, temp_path, session):
                download_success = True
                break
            time.sleep(2)

        if not download_success:
            tqdm.write(f"Failed to download {url} after 10 attempts (run with debug logging for details)")
            return []

        try:
            with gzip.open(temp_path, 'rb') as f:
                for line in f:
                    try:
                        # OPTIMIZATION: Check tokens before full parse
                        # orjson.loads accepts bytes, so the line never needs decoding to str
                        if HAS_ORJSON:
                            if fast_tokens:
                                # Simple byte-level check (very fast)
                                if not any(t in line for t in fast_tokens):
                                    continue
                            event = orjson.loads(line)
                        else:
                            # Standard Fallback
                            decoded = line.decode('utf-8')
                            if fast_tokens and not fast_string_check(decoded, fast_tokens):
                                continue
                            event = json.loads(decoded)

                        if passes_filters(event, repos, event_types):
                            results.append(event)
                    except Exception:
                        # GHArchive occasionally has malformed lines; expected at low rates.
                        continue
        except Exception as e:
            # E.g. a truncated or corrupt archive; keep what was read so far.
            tqdm.write(f"Error reading gzip for {url}: {e}")

        return results

    finally:
        # Runs on every exit path so temporary downloads never accumulate.
        if os.path.exists(temp_path):
            os.remove(temp_path)
|
|
146
|
+
|
|
147
|
+
class _RunState:
    """Tracks which hours have been completed so a crashed run can resume.

    State lives next to the output file as <output>.state.json. The fingerprint
    of the run (window + filters) is stored alongside the done-hour list, so
    re-running with different filters against the same output triggers a clear
    error rather than silently mixing data.
    """

    def __init__(self, output_path, fingerprint):
        """Load existing state for ``output_path``, if any.

        An unreadable or malformed state file is ignored with a warning and
        the run starts with no completed hours.

        Raises:
            ValueError: If the state file was written with a different
                fingerprint.
        """
        self._path = str(output_path) + ".state.json"
        self._fingerprint = fingerprint
        self._done = set()
        if os.path.exists(self._path):
            try:
                with open(self._path, "r", encoding="utf-8") as f:
                    payload = json.load(f)
            except (ValueError, OSError):
                # ValueError covers JSONDecodeError and undecodable bytes.
                logger.warning(f"State file {self._path} unreadable, starting fresh")
                return
            if not isinstance(payload, dict):
                # Valid JSON but not the object mark_done() writes.
                logger.warning(f"State file {self._path} unreadable, starting fresh")
                return
            if payload.get("fingerprint") != fingerprint:
                raise ValueError(
                    f"State file {self._path} was written for a different run "
                    f"(window or filters changed). Remove it or use a new --output."
                )
            self._done = set(payload.get("done_hours", []))

    def is_done(self, ts):
        """Return True if the hour ``ts`` was recorded as completed."""
        return ts.isoformat() in self._done

    def mark_done(self, ts):
        """Record the hour ``ts`` as completed and persist the state file."""
        self._done.add(ts.isoformat())
        # Write to a sibling file and swap it in with os.replace so a crash
        # mid-write cannot leave a truncated state file behind.
        tmp_path = self._path + ".tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(
                {"fingerprint": self._fingerprint, "done_hours": sorted(self._done)},
                f,
            )
        os.replace(tmp_path, self._path)

    def clear(self):
        """Delete the state file if it exists."""
        if os.path.exists(self._path):
            os.remove(self._path)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _run_fingerprint(start, end, repos, event_types):
    """Describe a run (time window plus filters) as a JSON-serializable dict.

    Filter lists are sorted so their order does not matter; a missing or
    empty filter is recorded as None.
    """
    fingerprint = {"start": start.isoformat(), "end": end.isoformat()}
    fingerprint["repos"] = None if not repos else sorted(repos)
    fingerprint["event_types"] = None if not event_types else sorted(event_types)
    return fingerprint
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def process_range(start, end, repos, event_types, output, workers):
    """Download, filter and store every hourly archive in [start, end).

    Hours are processed in parallel by a thread pool and written to
    ``output`` as they complete. Progress is tracked in a state file next to
    the output so an interrupted run can be resumed by calling this again
    with the same arguments.

    Args:
        start: First hour to process (inclusive).
        end: End of the window (exclusive).
        repos: Repository names to keep, or None for no repo filter.
        event_types: Event types to keep, or None for no type filter.
        output: Output path; ``.parquet`` selects Parquet, anything else JSONL.
        workers: Number of worker threads.

    Raises:
        ValueError: If the state file belongs to a different run, or if a
            resume is attempted into an existing Parquet file.
    """
    fingerprint = _run_fingerprint(start, end, repos, event_types)
    state = _RunState(output, fingerprint)

    resuming = bool(state._done)
    writer = DataWriter(output, append=resuming)
    all_timestamps = list(date_range(start, end))
    todo = [t for t in all_timestamps if not state.is_done(t)]
    skipped = len(all_timestamps) - len(todo)
    if skipped:
        logger.info(f"Resuming: skipping {skipped} hours already in state file")

    if not todo:
        writer.close()
        logger.info(f"Nothing to do; output already complete at {output}")
        return

    failed = []
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
            future_to_time = {
                executor.submit(process_single_hour, ts, repos, event_types): ts
                for ts in todo
            }

            with tqdm(
                total=len(todo),
                desc="Processing",
                unit="hr",
                smoothing=0,
                dynamic_ncols=True,
            ) as pbar:

                for future in concurrent.futures.as_completed(future_to_time):
                    # Pop rather than index so the finished future, and the
                    # events it holds, can be released once written.
                    ts = future_to_time.pop(future)
                    try:
                        data = future.result()
                        if data:
                            for record in data:
                                writer.write(record)
                        # Flush so this hour is durable on disk before we mark it
                        # done. If the process crashes after mark_done, restart
                        # skips this hour and the data is already written.
                        writer.flush()
                        state.mark_done(ts)
                    except Exception as exc:
                        failed.append(ts)
                        tqdm.write(f"Worker exception for {ts}: {exc}")
                    finally:
                        pbar.update(1)
    finally:
        # Close even on an unexpected error or Ctrl-C so a Parquet file gets
        # its footer and buffered records are not lost.
        writer.close()

    if failed and not writer.is_parquet:
        # Keep the state file so re-running the same command retries only the
        # failed hours. (Parquet output cannot be resumed, so its state is
        # cleared below as before.)
        logger.warning(
            f"{len(failed)} hour(s) failed; state kept at {output}.state.json. "
            f"Re-run the same command to retry them."
        )
        return

    state.clear()
    if failed:
        logger.warning(f"{len(failed)} hour(s) failed; {output} is incomplete")
    logger.info(f"Done! Data written to {output}")
|
gharc/utils.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# src/gharc/utils.py
|
|
2
|
+
import logging
|
|
3
|
+
from datetime import datetime, timedelta
|
|
4
|
+
from typing import Iterator
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger("gharc")
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def setup_logging(level: int = logging.INFO) -> None:
    """Give the gharc logger a console handler and set its level.

    Does nothing when the logger already has a handler, so calling it more
    than once never duplicates output (and leaves the existing level as is).
    """
    if not logger.handlers:
        console = logging.StreamHandler()
        formatter = logging.Formatter(
            '[%(asctime)s] %(levelname)s: %(message)s',
            datefmt='%H:%M:%S',
        )
        console.setFormatter(formatter)
        logger.addHandler(console)
        logger.setLevel(level)
|
|
20
|
+
|
|
21
|
+
def parse_date(date_str: str) -> datetime:
    """Parse a ``YYYY-MM-DD`` or ``YYYY-MM-DD-HH`` string into a datetime.

    A string with exactly three dash-separated parts is read as a date
    (midnight); anything else is read as date plus hour.

    Raises:
        ValueError: If the string matches neither format.
    """
    is_date_only = date_str.count('-') == 2
    pattern = "%Y-%m-%d" if is_date_only else "%Y-%m-%d-%H"
    try:
        return datetime.strptime(date_str, pattern)
    except ValueError:
        raise ValueError(f"Invalid date format: {date_str}. Use YYYY-MM-DD or YYYY-MM-DD-HH")
|
|
30
|
+
|
|
31
|
+
def date_range(start: datetime, end: datetime) -> Iterator[datetime]:
    """Yield one datetime per hour from ``start`` up to, but excluding, ``end``.

    Both bounds are first truncated to the top of their hour. Because the
    end is exclusive, an end of 2024-02-01 covers all of January 2024.
    """
    one_hour = timedelta(hours=1)
    cursor = start.replace(minute=0, second=0, microsecond=0)
    stop = end.replace(minute=0, second=0, microsecond=0)

    while cursor < stop:
        yield cursor
        cursor = cursor + one_hour
|
|
42
|
+
|
|
43
|
+
def get_url_for_time(dt: datetime) -> str:
    """
    Build the GHArchive download URL for the hour containing ``dt``.

    Month and day are zero-padded to two digits; the hour is written as a
    plain integer (0-23) without a leading zero.

    Examples:
        2024-01-01 15:00 -> https://data.gharchive.org/2024-01-01-15.json.gz
        2024-01-01 05:00 -> https://data.gharchive.org/2024-01-01-5.json.gz
    """
    archive_name = f"{dt.year}-{dt.month:02d}-{dt.day:02d}-{dt.hour}"
    return f"https://data.gharchive.org/{archive_name}.json.gz"
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gharc
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A stream-processing tool for GitHub Archive data filtering.
|
|
5
|
+
Author-email: Arav Panwar <aravpanwar@outlook.com>
|
|
6
|
+
Project-URL: Homepage, https://github.com/aravpanwar/gharc
|
|
7
|
+
Project-URL: Bug Tracker, https://github.com/aravpanwar/gharc/issues
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
12
|
+
Requires-Python: >=3.8
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
Requires-Dist: requests>=2.28.0
|
|
16
|
+
Requires-Dist: click>=8.0.0
|
|
17
|
+
Requires-Dist: pandas>=2.0.0
|
|
18
|
+
Requires-Dist: pyarrow>=12.0.0
|
|
19
|
+
Requires-Dist: tqdm>=4.65.0
|
|
20
|
+
Provides-Extra: fast
|
|
21
|
+
Requires-Dist: orjson>=3.9.0; extra == "fast"
|
|
22
|
+
Provides-Extra: test
|
|
23
|
+
Requires-Dist: pytest>=7.0; extra == "test"
|
|
24
|
+
Dynamic: license-file
|
|
25
|
+
|
|
26
|
+
# gharc: GitHub Archive Stream-Processor
|
|
27
|
+
|
|
28
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
29
|
+
[CI status](https://github.com/aravpanwar/gharc/actions)
|
|
30
|
+
[Python 3.8+](https://www.python.org/downloads/)
|
|
31
|
+
[Code style: black](https://github.com/psf/black)
|
|
32
|
+
[DOI: 10.5281/zenodo.19814232](https://doi.org/10.5281/zenodo.19814232)
|
|
33
|
+
|
|
34
|
+
**Mine the GitHub Archive on a standard laptop.**
|
|
35
|
+
|
|
36
|
+
`gharc` is a command-line tool and Python library that filters the [GitHub Archive](https://www.gharchive.org/) dataset on consumer hardware. Each hourly archive is downloaded to a temporary file, filtered against your criteria, written out as Parquet or JSONL, and then deleted. Peak local storage stays bounded by the in-flight downloads (about 150 MB each, one per worker) regardless of how long a window you process.
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
## Why gharc?
|
|
41
|
+
|
|
42
|
+
The full GitHub Archive dataset exceeds petabytes in size. Traditional analysis requires either massive local storage or expensive cloud warehousing (BigQuery).
|
|
43
|
+
|
|
44
|
+
`gharc` solves this by implementing a **Stream-and-Filter** architecture:
|
|
45
|
+
1. **Streaming:** Downloads each hourly archive (~60 to 150 MB compressed in 2024) to a temporary file.
|
|
46
|
+
2. **Filtering:** Extracts only events matching your criteria (e.g., specific repos or event types).
|
|
47
|
+
3. **Writing:** Streams matching events into a single **Parquet** or **JSONL** file. Parquet output goes through `pyarrow.ParquetWriter`, which adds a row group on every flush; JSONL output is appended line by line.
|
|
48
|
+
4. **Cleanup:** Deletes the temporary download immediately after, so disk usage never accumulates.
|
|
49
|
+
|
|
50
|
+
**Ideal for:**
|
|
51
|
+
- Academic research on Open Source Software (OSS).
|
|
52
|
+
- Large scale data mining on consumer hardware.
|
|
53
|
+
- Creating custom datasets for specific organizations or ecosystems.
|
|
54
|
+
|
|
55
|
+

|
|
56
|
+
|
|
57
|
+
---
|
|
58
|
+
|
|
59
|
+
## Key Features
|
|
60
|
+
|
|
61
|
+
* **Zero-Storage Overhead:** Processes terabytes of data with a bounded disk footprint: only the in-flight temporary downloads (about 150 MB each, one per worker) plus your filtered output.
|
|
62
|
+
* **Resumable Downloads:** Smart handling of network interruptions (common with residential internet) using HTTP Range requests.
|
|
63
|
+
* **High Performance:**
|
|
64
|
+
* Parallel processing with thread pools.
|
|
65
|
+
* Optimized "Fast String Check" (zero-copy filtering) to skip irrelevant data.
|
|
66
|
+
* Optional `orjson` support for 3-5x faster parsing.
|
|
67
|
+
* **Parquet Native:** Outputs columnar data ready for Pandas, Spark, or Polars, often reducing file size by 90% compared to JSON.
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## Performance
|
|
72
|
+
|
|
73
|
+
Measured on a Windows 11 laptop (12 logical cores, 15 GB RAM) over a typical residential connection. Reproducible scripts in [`benchmarks/`](benchmarks/).
|
|
74
|
+
|
|
75
|
+
A six-hour window of GHArchive (2024-01-01 00:00 to 06:00 UTC), filtered to `apache/spark`:
|
|
76
|
+
|
|
77
|
+
| Workers | Wall-clock | Hours/sec | Spark events | Peak RSS |
|
|
78
|
+
|---|---|---|---|---|
|
|
79
|
+
| 1 | 76.0 s | 0.079 | 14 | 94.2 MB |
|
|
80
|
+
| 4 | 58.1 s | 0.103 | 14 | 106.7 MB |
|
|
81
|
+
|
|
82
|
+
Both runs recovered the same events, so concurrency does not affect output. Peak RSS stays below 110 MB. The bottleneck on residential links is HTTPS download throughput rather than CPU; additional workers help up to a point and then saturate the connection.
|
|
83
|
+
|
|
84
|
+
The same six-hour window comprises about 1.2 GB of compressed source on the GHArchive side, while the filtered Parquet output is 53 KB. That is a storage saving of roughly 22,000 to 1, and at no point does peak local disk exceed the size of a single in-flight temporary file (about 150 MB).
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## Installation
|
|
89
|
+
|
|
90
|
+
### Prerequisites
|
|
91
|
+
- Python 3.8 or higher
|
|
92
|
+
- `pip`
|
|
93
|
+
|
|
94
|
+
### Install from Source
|
|
95
|
+
```bash
|
|
96
|
+
git clone https://github.com/aravpanwar/gharc.git
|
|
97
|
+
cd gharc
|
|
98
|
+
python3 -m venv venv
|
|
99
|
+
source venv/bin/activate
|
|
100
|
+
pip install -e .
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Optional Performance Boost
|
|
104
|
+
|
|
105
|
+
For maximum speed, install with the `fast` extra. `gharc` detects and uses `orjson` automatically when available.
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
pip install -e ".[fast]"
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## Usage
|
|
114
|
+
|
|
115
|
+
### Basic Command
|
|
116
|
+
|
|
117
|
+
Download all activity for a specific repository over a one-day window.
|
|
118
|
+
Note that `--end` is exclusive, so this covers all 24 hours of 2024-01-01.
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
gharc download \
|
|
122
|
+
--start 2024-01-01 \
|
|
123
|
+
--end 2024-01-02 \
|
|
124
|
+
--repos "apache/spark" \
|
|
125
|
+
--output spark_data.parquet
|
|
126
|
+
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### Advanced Filtering
|
|
130
|
+
|
|
131
|
+
Filter for multiple repositories and specific event types (e.g., only Pull Requests and Pushes).
|
|
132
|
+
This covers all of June 2023 (June 1 inclusive through July 1 exclusive).
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
gharc download \
|
|
136
|
+
--start 2023-06-01 \
|
|
137
|
+
--end 2023-07-01 \
|
|
138
|
+
--repos "apache/spark, pandas-dev/pandas, pytorch/pytorch" \
|
|
139
|
+
--event-types "PullRequestEvent, PushEvent" \
|
|
140
|
+
--output oss_summer_2023.parquet \
|
|
141
|
+
--workers 4
|
|
142
|
+
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### Arguments
|
|
146
|
+
|
|
147
|
+
| Argument | Description | Example |
|
|
148
|
+
| --- | --- | --- |
|
|
149
|
+
| `--start` | Start date, inclusive (YYYY-MM-DD or YYYY-MM-DD-HH) | `2024-01-01` |
|
|
150
|
+
| `--end` | End date, exclusive (YYYY-MM-DD or YYYY-MM-DD-HH) | `2024-02-01` |
|
|
151
|
+
| `--repos` | Comma-separated list of repositories to keep | `apache/spark,tensorflow/tensorflow` |
|
|
152
|
+
| `--event-types` | Comma-separated list of GHArchive event types | `WatchEvent,ForkEvent` |
|
|
153
|
+
| `--output` | Output filename (`.parquet` or `.jsonl`) | `data.parquet` |
|
|
154
|
+
| `--workers` | Number of parallel download threads (default: 4) | `8` |
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## Resumable runs
|
|
159
|
+
|
|
160
|
+
For long jobs, `gharc` keeps a small `<output>.state.json` next to the output file listing which hours it has already processed. If the run crashes, restarting the same command picks up where it left off rather than redoing completed hours. The state file is removed automatically when the run finishes cleanly.
|
|
161
|
+
|
|
162
|
+
Resume support requires JSONL output. Parquet writers cannot append to a closed file, so for multi-hour runs use `--output run.jsonl` and convert to Parquet at the end:
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
gharc convert run.jsonl run.parquet
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
## Python API
|
|
171
|
+
|
|
172
|
+
The CLI is a thin wrapper around `gharc.process_range`, which you can call directly:
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
from datetime import datetime
|
|
176
|
+
import gharc
|
|
177
|
+
|
|
178
|
+
gharc.setup_logging()
|
|
179
|
+
gharc.process_range(
|
|
180
|
+
start=datetime(2024, 1, 1),
|
|
181
|
+
end=datetime(2024, 1, 2),
|
|
182
|
+
repos=["apache/spark"],
|
|
183
|
+
event_types=None,
|
|
184
|
+
output="spark_one_day.jsonl",
|
|
185
|
+
workers=4,
|
|
186
|
+
)
|
|
187
|
+
|
|
188
|
+
gharc.jsonl_to_parquet("spark_one_day.jsonl", "spark_one_day.parquet")
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
`__all__` in `gharc/__init__.py` lists the public surface (`process_range`, `jsonl_to_parquet`, `DataWriter`, `parse_date`, `date_range`, `get_url_for_time`, `setup_logging`, plus the filter helpers).
|
|
192
|
+
|
|
193
|
+
---
|
|
194
|
+
|
|
195
|
+
## Automating Bulk Downloads
|
|
196
|
+
|
|
197
|
+
For long date ranges, the included [`examples/orchestrator.py`](examples/orchestrator.py) script runs `gharc` month by month, so a long range produces one Parquet file per month rather than a single giant output:
|
|
198
|
+
|
|
199
|
+
```bash
|
|
200
|
+
python examples/orchestrator.py \
|
|
201
|
+
--start 2023-01-01 \
|
|
202
|
+
--end 2024-01-01 \
|
|
203
|
+
--repos "apache/spark,pandas-dev/pandas" \
|
|
204
|
+
--output-dir ./gharc_out \
|
|
205
|
+
--workers 4
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
---
|
|
209
|
+
|
|
210
|
+
## Repository Layout
|
|
211
|
+
|
|
212
|
+
```
|
|
213
|
+
gharc/
|
|
214
|
+
├── src/gharc/ # Library + CLI entry point
|
|
215
|
+
├── tests/ # pytest test suite
|
|
216
|
+
├── benchmarks/ # Reproducible runs that back the performance claims
|
|
217
|
+
├── examples/ # Driver scripts (e.g. month-by-month orchestrator)
|
|
218
|
+
├── paper/ # paper.md, paper.bib, figures (the JOSS submission)
|
|
219
|
+
└── CITATION.cff # GitHub-detectable citation metadata
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
---
|
|
223
|
+
|
|
224
|
+
## Contributing
|
|
225
|
+
|
|
226
|
+
Contributions are welcome. Please read [CONTRIBUTING.md](CONTRIBUTING.md) for details on the process for submitting pull requests.
|
|
227
|
+
|
|
228
|
+
**Running Tests:**
|
|
229
|
+
|
|
230
|
+
```bash
|
|
231
|
+
pip install -e ".[test]"
|
|
232
|
+
pytest tests/
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
---
|
|
236
|
+
|
|
237
|
+
## Citation
|
|
238
|
+
|
|
239
|
+
The accompanying paper is at [`paper/paper.pdf`](paper/paper.pdf) and is rebuilt automatically on every push by the [Paper CI workflow](.github/workflows/paper.yml).
|
|
240
|
+
|
|
241
|
+
If you use `gharc` in your research, please cite it using the metadata in `CITATION.cff` or as follows:
|
|
242
|
+
|
|
243
|
+
```bibtex
|
|
244
|
+
@software{gharc2026,
|
|
245
|
+
author = {Panwar, Arav},
|
|
246
|
+
title = {gharc: A stream-and-filter tool for the GitHub Archive on consumer hardware},
|
|
247
|
+
year = {2026},
|
|
248
|
+
url = {https://github.com/aravpanwar/gharc}
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
---
|
|
254
|
+
|
|
255
|
+
## License
|
|
256
|
+
|
|
257
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
258
|
+
|
|
259
|
+
Created by Arav Panwar
|
|
260
|
+
[aravpanwar.com](https://www.aravpanwar.com)
|
|
261
|
+
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
gharc/__init__.py,sha256=ueXnxVXSj8uUDs896_Xy2WC7zRJS-7vvBgWHgqNsq8c,464
|
|
2
|
+
gharc/cli.py,sha256=VOwHaP8GW43Wsasj5VaW7ScxIJsgYHAKwIFHGnAqaMA,1808
|
|
3
|
+
gharc/filters.py,sha256=lZus0uhC_4v6wct6GpEtAX8QZGlJyROIBD2tbGcj2k8,934
|
|
4
|
+
gharc/storage.py,sha256=ExJe9NtzZxCqlkZ0f7czjjt_rF3MtdVSN-cz8rVI3mU,4398
|
|
5
|
+
gharc/streamer.py,sha256=QxpUf0jv6hPalhdhYF_CUYY40iVhd2l3IgZIPdjOVmk,9170
|
|
6
|
+
gharc/utils.py,sha256=HZjOC_HS3NbTPIspxOVulMkxKP-P1XocLMOvzelEAdg,2090
|
|
7
|
+
gharc-0.1.0.dist-info/licenses/LICENSE,sha256=hLV7YOq2UYYw8ediyrCxcRSUos0HsY5SgDchmEhFWwk,1089
|
|
8
|
+
gharc-0.1.0.dist-info/METADATA,sha256=DP_5BWZaIukrLG3kRvEg9dwxGRBoTHjTi5Ho4uSPudU,9572
|
|
9
|
+
gharc-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
10
|
+
gharc-0.1.0.dist-info/entry_points.txt,sha256=Zlj3sVYsSFLRTsk1zFdzeWcMmeFP2KCJHhLQK6KpcTw,41
|
|
11
|
+
gharc-0.1.0.dist-info/top_level.txt,sha256=DvL9ErtXeThzfYDkvOo2dscEIKt7UDXH1WMZmYEjSxw,6
|
|
12
|
+
gharc-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Arav Panwar
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
gharc
|