gharc 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,32 @@
1
+ cff-version: 1.2.0
2
+ message: "If you use this software, please cite it as below."
3
+ type: software
4
+ title: "gharc: A stream-and-filter tool for the GitHub Archive on consumer hardware"
5
+ abstract: >-
6
+ gharc filters the GitHub Archive on consumer hardware by streaming
7
+ each hourly archive through memory and writing only matching events
8
+ to Parquet or JSONL. Peak local storage stays bounded by a single
9
+ in-flight download regardless of the time range processed.
10
+ authors:
11
+ - family-names: Panwar
12
+ given-names: Arav
13
+ orcid: https://orcid.org/0009-0009-3013-5970
14
+ version: 0.1.0
15
+ date-released: 2026-04-26
16
+ license: MIT
17
+ repository-code: https://github.com/aravpanwar/gharc
18
+ doi: 10.5281/zenodo.19814232
19
+ identifiers:
20
+ - type: doi
21
+ value: 10.5281/zenodo.19814232
22
+ description: Concept DOI for all versions
23
+ - type: doi
24
+ value: 10.5281/zenodo.19814233
25
+ description: DOI for v0.1.0
26
+ keywords:
27
+ - github
28
+ - mining-software-repositories
29
+ - msr
30
+ - stream-processing
31
+ - parquet
32
+ - python
gharc-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Arav Panwar
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,6 @@
1
+ include README.md
2
+ include LICENSE
3
+ include CITATION.cff
4
+ recursive-include paper *
5
+ recursive-include examples *.py
6
+ recursive-include tests *.py
gharc-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,261 @@
1
+ Metadata-Version: 2.4
2
+ Name: gharc
3
+ Version: 0.1.0
4
+ Summary: A stream-processing tool for GitHub Archive data filtering.
5
+ Author-email: Arav Panwar <aravpanwar@outlook.com>
6
+ Project-URL: Homepage, https://github.com/aravpanwar/gharc
7
+ Project-URL: Bug Tracker, https://github.com/aravpanwar/gharc/issues
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
12
+ Requires-Python: >=3.8
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: requests>=2.28.0
16
+ Requires-Dist: click>=8.0.0
17
+ Requires-Dist: pandas>=2.0.0
18
+ Requires-Dist: pyarrow>=12.0.0
19
+ Requires-Dist: tqdm>=4.65.0
20
+ Provides-Extra: fast
21
+ Requires-Dist: orjson>=3.9.0; extra == "fast"
22
+ Provides-Extra: test
23
+ Requires-Dist: pytest>=7.0; extra == "test"
24
+ Dynamic: license-file
25
+
26
+ # gharc: GitHub Archive Stream-Processor
27
+
28
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
29
+ [![Tests](https://github.com/aravpanwar/gharc/actions/workflows/test.yml/badge.svg)](https://github.com/aravpanwar/gharc/actions)
30
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
31
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
32
+ [![DOI](https://zenodo.org/badge/1112791047.svg)](https://doi.org/10.5281/zenodo.19814232)
33
+
34
+ **Mine the GitHub Archive on a standard laptop.**
35
+
36
+ `gharc` is a command-line tool and Python library that filters the [GitHub Archive](https://www.gharchive.org/) dataset on consumer hardware. Each hourly archive is streamed through memory, filtered against your criteria, and written out as Parquet or JSONL. Peak local storage stays bounded by a single in-flight download (about 150 MB) regardless of how long a window you process.
37
+
38
+ ---
39
+
40
+ ## Why gharc?
41
+
42
+ The full GitHub Archive dataset exceeds petabytes in size. Traditional analysis requires either massive local storage or expensive cloud warehousing (BigQuery).
43
+
44
+ `gharc` solves this by implementing a **Stream-and-Filter** architecture:
45
+ 1. **Streaming:** Downloads each hourly archive (~60 to 150 MB compressed in 2024) to a temporary file.
46
+ 2. **Filtering:** Extracts only events matching your criteria (e.g., specific repos or event types).
47
+ 3. **Writing:** Streams matching events into a single output file: **Parquet** (via `pyarrow.ParquetWriter`, for true append) or **JSONL**.
48
+ 4. **Cleanup:** Deletes the temporary download immediately after, so disk usage never accumulates.
49
+
50
+ **Ideal for:**
51
+ - Academic research on Open Source Software (OSS).
52
+ - Large scale data mining on consumer hardware.
53
+ - Creating custom datasets for specific organizations or ecosystems.
54
+
55
+ ![Architecture: GHArchive HTTPS to thread pool to resumable download to temp file to streaming decode and filter to DataWriter to output file.](paper/figures/architecture.png)
56
+
57
+ ---
58
+
59
+ ## Key Features
60
+
61
+ * **Zero-Storage Overhead:** Processes terabytes of data with a local disk footprint bounded by a single in-flight download (about 150 MB).
62
+ * **Resumable Downloads:** Smart handling of network interruptions (common with residential internet) using HTTP Range requests.
63
+ * **High Performance:**
64
+ * Parallel processing with thread pools.
65
+ * Optimized "Fast String Check" (zero-copy filtering) to skip irrelevant data.
66
+ * Optional `orjson` support for 3-5x faster parsing.
67
+ * **Parquet Native:** Outputs columnar data ready for Pandas, Spark, or Polars, often reducing file size by 90% compared to JSON.
68
+
69
+ ---
70
+
71
+ ## Performance
72
+
73
+ Measured on a Windows 11 laptop (12 logical cores, 15 GB RAM) over a typical residential connection. Reproducible scripts in [`benchmarks/`](benchmarks/).
74
+
75
+ A six-hour window of GHArchive (2024-01-01 00:00 to 06:00 UTC), filtered to `apache/spark`:
76
+
77
+ | Workers | Wall-clock | Hours/sec | Spark events | Peak RSS |
78
+ |---|---|---|---|---|
79
+ | 1 | 76.0 s | 0.079 | 14 | 94.2 MB |
80
+ | 4 | 58.1 s | 0.103 | 14 | 106.7 MB |
81
+
82
+ Both runs recovered the same events, so concurrency does not affect output. Peak RSS stays below 110 MB. The bottleneck on residential links is HTTPS download throughput rather than CPU; additional workers help up to a point and then saturate the connection.
83
+
84
+ The same six-hour window comprises about 1.2 GB of compressed source on the GHArchive side, while the filtered Parquet output is 53 KB. That is a storage saving of roughly 22,000 to 1, and at no point does peak local disk exceed the size of a single in-flight temporary file (about 150 MB).
85
+
86
+ ---
87
+
88
+ ## Installation
89
+
90
+ ### Prerequisites
91
+ - Python 3.8 or higher
92
+ - `pip`
93
+
94
+ ### Install from Source
95
+ ```bash
96
+ git clone https://github.com/aravpanwar/gharc.git
97
+ cd gharc
98
+ python3 -m venv venv
99
+ source venv/bin/activate
100
+ pip install -e .
101
+ ```
102
+
103
+ ### Optional Performance Boost
104
+
105
+ For maximum speed, install with the `fast` extra. `gharc` detects and uses `orjson` automatically when available.
106
+
107
+ ```bash
108
+ pip install -e ".[fast]"
109
+ ```
110
+
111
+ ---
112
+
113
+ ## Usage
114
+
115
+ ### Basic Command
116
+
117
+ Download all activity for a specific repository over a one-day window.
118
+ Note that `--end` is exclusive, so this covers all 24 hours of 2024-01-01.
119
+
120
+ ```bash
121
+ gharc download \
122
+ --start 2024-01-01 \
123
+ --end 2024-01-02 \
124
+ --repos "apache/spark" \
125
+ --output spark_data.parquet
126
+
127
+ ```
128
+
129
+ ### Advanced Filtering
130
+
131
+ Filter for multiple repositories and specific event types (e.g., only Pull Requests and Pushes).
132
+ This covers all of June 2023 (June 1 inclusive through July 1 exclusive).
133
+
134
+ ```bash
135
+ gharc download \
136
+ --start 2023-06-01 \
137
+ --end 2023-07-01 \
138
+ --repos "apache/spark, pandas-dev/pandas, pytorch/pytorch" \
139
+ --event-types "PullRequestEvent, PushEvent" \
140
+ --output oss_summer_2023.parquet \
141
+ --workers 4
142
+
143
+ ```
144
+
145
+ ### Arguments
146
+
147
+ | Argument | Description | Example |
148
+ | --- | --- | --- |
149
+ | `--start` | Start date, inclusive (YYYY-MM-DD or YYYY-MM-DD-HH) | `2024-01-01` |
150
+ | `--end` | End date, exclusive (YYYY-MM-DD or YYYY-MM-DD-HH) | `2024-02-01` |
151
+ | `--repos` | Comma-separated list of repositories to keep | `apache/spark,tensorflow/tensorflow` |
152
+ | `--event-types` | Comma-separated list of GHArchive event types | `WatchEvent,ForkEvent` |
153
+ | `--output` | Output filename (`.parquet` or `.jsonl`) | `data.parquet` |
154
+ | `--workers` | Number of parallel download threads (default: 4) | `8` |
155
+
156
+ ---
157
+
158
+ ## Resumable runs
159
+
160
+ For long jobs, `gharc` keeps a small `<output>.state.json` next to the output file listing which hours it has already processed. If the run crashes, restarting the same command picks up where it left off rather than redoing completed hours. The state file is removed automatically when the run finishes cleanly.
161
+
162
+ Resume support requires JSONL output. Parquet writers cannot append to a closed file, so for multi-hour runs use `--output run.jsonl` and convert to Parquet at the end:
163
+
164
+ ```bash
165
+ gharc convert run.jsonl run.parquet
166
+ ```
167
+
168
+ ---
169
+
170
+ ## Python API
171
+
172
+ The CLI is a thin wrapper around `gharc.process_range`, which you can call directly:
173
+
174
+ ```python
175
+ from datetime import datetime
176
+ import gharc
177
+
178
+ gharc.setup_logging()
179
+ gharc.process_range(
180
+ start=datetime(2024, 1, 1),
181
+ end=datetime(2024, 1, 2),
182
+ repos=["apache/spark"],
183
+ event_types=None,
184
+ output="spark_one_day.jsonl",
185
+ workers=4,
186
+ )
187
+
188
+ gharc.jsonl_to_parquet("spark_one_day.jsonl", "spark_one_day.parquet")
189
+ ```
190
+
191
+ `__all__` in `gharc/__init__.py` lists the public surface (`process_range`, `jsonl_to_parquet`, `DataWriter`, `parse_date`, `date_range`, `get_url_for_time`, `setup_logging`, plus the filter helpers).
192
+
193
+ ---
194
+
195
+ ## Automating Bulk Downloads
196
+
197
+ For long date ranges, the included [`examples/orchestrator.py`](examples/orchestrator.py) script runs `gharc` month by month, so the range produces one Parquet file per calendar month rather than one giant output:
198
+
199
+ ```bash
200
+ python examples/orchestrator.py \
201
+ --start 2023-01-01 \
202
+ --end 2024-01-01 \
203
+ --repos "apache/spark,pandas-dev/pandas" \
204
+ --output-dir ./gharc_out \
205
+ --workers 4
206
+ ```
207
+
208
+ ---
209
+
210
+ ## Repository Layout
211
+
212
+ ```
213
+ gharc/
214
+ ├── src/gharc/ # Library + CLI entry point
215
+ ├── tests/ # pytest test suite
216
+ ├── benchmarks/ # Reproducible runs that back the performance claims
217
+ ├── examples/ # Driver scripts (e.g. month-by-month orchestrator)
218
+ ├── paper/ # paper.md, paper.bib, figures (the JOSS submission)
219
+ └── CITATION.cff # GitHub-detectable citation metadata
220
+ ```
221
+
222
+ ---
223
+
224
+ ## Contributing
225
+
226
+ Contributions are welcome. Please read [CONTRIBUTING.md](CONTRIBUTING.md) for details on the process for submitting pull requests.
227
+
228
+ **Running Tests:**
229
+
230
+ ```bash
231
+ pip install -e ".[test]"
232
+ pytest tests/
233
+ ```
234
+
235
+ ---
236
+
237
+ ## Citation
238
+
239
+ The accompanying paper is at [`paper/paper.pdf`](paper/paper.pdf) and is rebuilt automatically on every push by the [Paper CI workflow](.github/workflows/paper.yml).
240
+
241
+ If you use `gharc` in your research, please cite it using the metadata in `CITATION.cff` or as follows:
242
+
243
+ ```bibtex
244
+ @software{gharc2026,
245
+ author = {Panwar, Arav},
246
+ title = {gharc: A stream-and-filter tool for the GitHub Archive on consumer hardware},
247
+ year = {2026},
248
+ url = {https://github.com/aravpanwar/gharc}
249
+ }
250
+
251
+ ```
252
+
253
+ ---
254
+
255
+ ## License
256
+
257
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
258
+
259
+ Created by Arav Panwar
260
+ [aravpanwar.com](https://www.aravpanwar.com)
261
+
gharc-0.1.0/README.md ADDED
@@ -0,0 +1,236 @@
1
+ # gharc: GitHub Archive Stream-Processor
2
+
3
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
4
+ [![Tests](https://github.com/aravpanwar/gharc/actions/workflows/test.yml/badge.svg)](https://github.com/aravpanwar/gharc/actions)
5
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
6
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
7
+ [![DOI](https://zenodo.org/badge/1112791047.svg)](https://doi.org/10.5281/zenodo.19814232)
8
+
9
+ **Mine the GitHub Archive on a standard laptop.**
10
+
11
+ `gharc` is a command-line tool and Python library that filters the [GitHub Archive](https://www.gharchive.org/) dataset on consumer hardware. Each hourly archive is streamed through memory, filtered against your criteria, and written out as Parquet or JSONL. Peak local storage stays bounded by a single in-flight download (about 150 MB) regardless of how long a window you process.
12
+
13
+ ---
14
+
15
+ ## Why gharc?
16
+
17
+ The full GitHub Archive dataset exceeds petabytes in size. Traditional analysis requires either massive local storage or expensive cloud warehousing (BigQuery).
18
+
19
+ `gharc` solves this by implementing a **Stream-and-Filter** architecture:
20
+ 1. **Streaming:** Downloads each hourly archive (~60 to 150 MB compressed in 2024) to a temporary file.
21
+ 2. **Filtering:** Extracts only events matching your criteria (e.g., specific repos or event types).
22
+ 3. **Writing:** Streams matching events into a single output file: **Parquet** (via `pyarrow.ParquetWriter`, for true append) or **JSONL**.
23
+ 4. **Cleanup:** Deletes the temporary download immediately after, so disk usage never accumulates.
24
+
25
+ **Ideal for:**
26
+ - Academic research on Open Source Software (OSS).
27
+ - Large scale data mining on consumer hardware.
28
+ - Creating custom datasets for specific organizations or ecosystems.
29
+
30
+ ![Architecture: GHArchive HTTPS to thread pool to resumable download to temp file to streaming decode and filter to DataWriter to output file.](paper/figures/architecture.png)
31
+
32
+ ---
33
+
34
+ ## Key Features
35
+
36
+ * **Zero-Storage Overhead:** Processes terabytes of data with a local disk footprint bounded by a single in-flight download (about 150 MB).
37
+ * **Resumable Downloads:** Smart handling of network interruptions (common with residential internet) using HTTP Range requests.
38
+ * **High Performance:**
39
+ * Parallel processing with thread pools.
40
+ * Optimized "Fast String Check" (zero-copy filtering) to skip irrelevant data.
41
+ * Optional `orjson` support for 3-5x faster parsing.
42
+ * **Parquet Native:** Outputs columnar data ready for Pandas, Spark, or Polars, often reducing file size by 90% compared to JSON.
43
+
44
+ ---
45
+
46
+ ## Performance
47
+
48
+ Measured on a Windows 11 laptop (12 logical cores, 15 GB RAM) over a typical residential connection. Reproducible scripts in [`benchmarks/`](benchmarks/).
49
+
50
+ A six-hour window of GHArchive (2024-01-01 00:00 to 06:00 UTC), filtered to `apache/spark`:
51
+
52
+ | Workers | Wall-clock | Hours/sec | Spark events | Peak RSS |
53
+ |---|---|---|---|---|
54
+ | 1 | 76.0 s | 0.079 | 14 | 94.2 MB |
55
+ | 4 | 58.1 s | 0.103 | 14 | 106.7 MB |
56
+
57
+ Both runs recovered the same events, so concurrency does not affect output. Peak RSS stays below 110 MB. The bottleneck on residential links is HTTPS download throughput rather than CPU; additional workers help up to a point and then saturate the connection.
58
+
59
+ The same six-hour window comprises about 1.2 GB of compressed source on the GHArchive side, while the filtered Parquet output is 53 KB. That is a storage saving of roughly 22,000 to 1, and at no point does peak local disk exceed the size of a single in-flight temporary file (about 150 MB).
60
+
61
+ ---
62
+
63
+ ## Installation
64
+
65
+ ### Prerequisites
66
+ - Python 3.8 or higher
67
+ - `pip`
68
+
69
+ ### Install from Source
70
+ ```bash
71
+ git clone https://github.com/aravpanwar/gharc.git
72
+ cd gharc
73
+ python3 -m venv venv
74
+ source venv/bin/activate
75
+ pip install -e .
76
+ ```
77
+
78
+ ### Optional Performance Boost
79
+
80
+ For maximum speed, install with the `fast` extra. `gharc` detects and uses `orjson` automatically when available.
81
+
82
+ ```bash
83
+ pip install -e ".[fast]"
84
+ ```
85
+
86
+ ---
87
+
88
+ ## Usage
89
+
90
+ ### Basic Command
91
+
92
+ Download all activity for a specific repository over a one-day window.
93
+ Note that `--end` is exclusive, so this covers all 24 hours of 2024-01-01.
94
+
95
+ ```bash
96
+ gharc download \
97
+ --start 2024-01-01 \
98
+ --end 2024-01-02 \
99
+ --repos "apache/spark" \
100
+ --output spark_data.parquet
101
+
102
+ ```
103
+
104
+ ### Advanced Filtering
105
+
106
+ Filter for multiple repositories and specific event types (e.g., only Pull Requests and Pushes).
107
+ This covers all of June 2023 (June 1 inclusive through July 1 exclusive).
108
+
109
+ ```bash
110
+ gharc download \
111
+ --start 2023-06-01 \
112
+ --end 2023-07-01 \
113
+ --repos "apache/spark, pandas-dev/pandas, pytorch/pytorch" \
114
+ --event-types "PullRequestEvent, PushEvent" \
115
+ --output oss_summer_2023.parquet \
116
+ --workers 4
117
+
118
+ ```
119
+
120
+ ### Arguments
121
+
122
+ | Argument | Description | Example |
123
+ | --- | --- | --- |
124
+ | `--start` | Start date, inclusive (YYYY-MM-DD or YYYY-MM-DD-HH) | `2024-01-01` |
125
+ | `--end` | End date, exclusive (YYYY-MM-DD or YYYY-MM-DD-HH) | `2024-02-01` |
126
+ | `--repos` | Comma-separated list of repositories to keep | `apache/spark,tensorflow/tensorflow` |
127
+ | `--event-types` | Comma-separated list of GHArchive event types | `WatchEvent,ForkEvent` |
128
+ | `--output` | Output filename (`.parquet` or `.jsonl`) | `data.parquet` |
129
+ | `--workers` | Number of parallel download threads (default: 4) | `8` |
130
+
131
+ ---
132
+
133
+ ## Resumable runs
134
+
135
+ For long jobs, `gharc` keeps a small `<output>.state.json` next to the output file listing which hours it has already processed. If the run crashes, restarting the same command picks up where it left off rather than redoing completed hours. The state file is removed automatically when the run finishes cleanly.
136
+
137
+ Resume support requires JSONL output. Parquet writers cannot append to a closed file, so for multi-hour runs use `--output run.jsonl` and convert to Parquet at the end:
138
+
139
+ ```bash
140
+ gharc convert run.jsonl run.parquet
141
+ ```
142
+
143
+ ---
144
+
145
+ ## Python API
146
+
147
+ The CLI is a thin wrapper around `gharc.process_range`, which you can call directly:
148
+
149
+ ```python
150
+ from datetime import datetime
151
+ import gharc
152
+
153
+ gharc.setup_logging()
154
+ gharc.process_range(
155
+ start=datetime(2024, 1, 1),
156
+ end=datetime(2024, 1, 2),
157
+ repos=["apache/spark"],
158
+ event_types=None,
159
+ output="spark_one_day.jsonl",
160
+ workers=4,
161
+ )
162
+
163
+ gharc.jsonl_to_parquet("spark_one_day.jsonl", "spark_one_day.parquet")
164
+ ```
165
+
166
+ `__all__` in `gharc/__init__.py` lists the public surface (`process_range`, `jsonl_to_parquet`, `DataWriter`, `parse_date`, `date_range`, `get_url_for_time`, `setup_logging`, plus the filter helpers).
167
+
168
+ ---
169
+
170
+ ## Automating Bulk Downloads
171
+
172
+ For long date ranges, the included [`examples/orchestrator.py`](examples/orchestrator.py) script runs `gharc` month by month, so the range produces one Parquet file per calendar month rather than one giant output:
173
+
174
+ ```bash
175
+ python examples/orchestrator.py \
176
+ --start 2023-01-01 \
177
+ --end 2024-01-01 \
178
+ --repos "apache/spark,pandas-dev/pandas" \
179
+ --output-dir ./gharc_out \
180
+ --workers 4
181
+ ```
182
+
183
+ ---
184
+
185
+ ## Repository Layout
186
+
187
+ ```
188
+ gharc/
189
+ ├── src/gharc/ # Library + CLI entry point
190
+ ├── tests/ # pytest test suite
191
+ ├── benchmarks/ # Reproducible runs that back the performance claims
192
+ ├── examples/ # Driver scripts (e.g. month-by-month orchestrator)
193
+ ├── paper/ # paper.md, paper.bib, figures (the JOSS submission)
194
+ └── CITATION.cff # GitHub-detectable citation metadata
195
+ ```
196
+
197
+ ---
198
+
199
+ ## Contributing
200
+
201
+ Contributions are welcome. Please read [CONTRIBUTING.md](CONTRIBUTING.md) for details on the process for submitting pull requests.
202
+
203
+ **Running Tests:**
204
+
205
+ ```bash
206
+ pip install -e ".[test]"
207
+ pytest tests/
208
+ ```
209
+
210
+ ---
211
+
212
+ ## Citation
213
+
214
+ The accompanying paper is at [`paper/paper.pdf`](paper/paper.pdf) and is rebuilt automatically on every push by the [Paper CI workflow](.github/workflows/paper.yml).
215
+
216
+ If you use `gharc` in your research, please cite it using the metadata in `CITATION.cff` or as follows:
217
+
218
+ ```bibtex
219
+ @software{gharc2026,
220
+ author = {Panwar, Arav},
221
+ title = {gharc: A stream-and-filter tool for the GitHub Archive on consumer hardware},
222
+ year = {2026},
223
+ url = {https://github.com/aravpanwar/gharc}
224
+ }
225
+
226
+ ```
227
+
228
+ ---
229
+
230
+ ## License
231
+
232
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
233
+
234
+ Created by Arav Panwar
235
+ [aravpanwar.com](https://www.aravpanwar.com)
236
+
@@ -0,0 +1,69 @@
1
+ """Run gharc month-by-month over a long date range.
2
+
3
+ Useful when you're working on a laptop and want one parquet file per month
4
+ rather than one giant file. Options are passed on the command line; run
5
+ `python examples/orchestrator.py --help` to list them.
6
+ """
7
+ import argparse
8
+ import os
9
+ import subprocess
10
+ from datetime import datetime, timedelta
11
+
12
+
13
def get_month_ranges(start, end):
    """Split the half-open range ``[start, end)`` into calendar-month chunks.

    The first chunk begins exactly at ``start`` and the last one stops exactly
    at ``end``; every boundary in between falls at midnight on the first day
    of a month.

    Args:
        start: First instant to cover (inclusive), normally a ``datetime``.
        end: Instant at which to stop (exclusive), same type as ``start``.

    Yields:
        ``(chunk_start, chunk_end, start_str, end_str)`` tuples, where the two
        strings are the chunk bounds formatted as ``YYYY-MM-DD-HH``. Nothing
        is yielded when ``start >= end``.
    """
    stamp_format = "%Y-%m-%d-%H"
    current = start
    while current < end:
        # Day 1 plus 32 days always lands in the following month, whatever
        # the month length; snapping back to day 1 gives that month's start.
        next_month = (current.replace(day=1) + timedelta(days=32)).replace(day=1)
        if isinstance(next_month, datetime):
            # ``replace(day=1)`` keeps the time of day, so a start such as
            # 2024-01-15 05:00 would otherwise push every later boundary to
            # 05:00 on the 1st and leak hours into the previous month's chunk.
            next_month = next_month.replace(hour=0, minute=0, second=0, microsecond=0)
        chunk_end = min(next_month, end)

        yield (
            current,
            chunk_end,
            current.strftime(stamp_format),
            chunk_end.strftime(stamp_format),
        )
        current = next_month
24
+
25
+
26
def main():
    """Run ``gharc download`` once per calendar month of the requested range.

    Reads ``--start``, ``--end``, ``--repos``, ``--output-dir`` and
    ``--workers`` from the command line, creates the output directory, and
    writes one ``gharchive_<YYYY-MM>.parquet`` file per month. Months whose
    output file already exists are skipped, and a failing month is reported
    without stopping the remaining ones.
    """
    parser = argparse.ArgumentParser(description="Run gharc month-by-month.")
    parser.add_argument("--start", required=True, help="Start date YYYY-MM-DD")
    parser.add_argument("--end", required=True, help="End date YYYY-MM-DD (exclusive)")
    parser.add_argument("--repos", required=True, help="Comma-separated repos")
    parser.add_argument("--output-dir", default="./gharc_out", help="Output directory")
    parser.add_argument("--workers", type=int, default=4)
    opts = parser.parse_args()

    day_format = "%Y-%m-%d"
    start = datetime.strptime(opts.start, day_format)
    end = datetime.strptime(opts.end, day_format)

    out_dir = opts.output_dir
    os.makedirs(out_dir, exist_ok=True)
    print(f"Batch run: {start.date()} to {end.date()}")
    print(f"Output: {out_dir}")

    for chunk_start, _chunk_end, start_arg, end_arg in get_month_ranges(start, end):
        label = chunk_start.strftime("%Y-%m")
        target = os.path.join(out_dir, f"gharchive_{label}.parquet")

        # An existing file is treated as a completed month, so reruns are cheap.
        if os.path.exists(target):
            print(f"Skipping {label} (file exists)")
            continue

        print(f"\nProcessing {label}...")

        command = ["gharc", "download"]
        command += ["--start", start_arg]
        command += ["--end", end_arg]
        command += ["--repos", opts.repos]
        command += ["--output", target]
        command += ["--workers", str(opts.workers)]

        try:
            subprocess.run(command, check=True)
        except subprocess.CalledProcessError:
            # One bad month should not abort the rest of the batch.
            print(f"Error processing {label}, continuing")
        else:
            print(f"Finished {label}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,9 @@
1
+ %% gharc stream-and-filter architecture.
2
+ %% Render with mermaid-cli: mmdc -i architecture.mmd -o architecture.png -b transparent
3
+ flowchart LR
4
+ A["GHArchive<br/>HTTPS (per hour)"] --> B["Thread pool<br/>(N workers)"]
5
+ B --> C["Resumable<br/>download"]
6
+ C --> D["Temp .json.gz<br/>(deleted after)"]
7
+ D --> E["Streaming<br/>decode + filter"]
8
+ E --> F["DataWriter<br/>(ParquetWriter / JSONL)"]
9
+ F --> G["Output file<br/>on disk"]
Binary file