@hardlydifficult/repo-processor 1.0.42 → 1.0.44
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +202 -156
- package/package.json +7 -7
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# @hardlydifficult/repo-processor
|
|
2
2
|
|
|
3
|
-
Incremental GitHub
|
|
3
|
+
Incremental GitHub repo processor with SHA-based stale detection, parallel file/dir processing, and git-backed YAML persistence.
|
|
4
4
|
|
|
5
5
|
## Installation
|
|
6
6
|
|
|
@@ -10,220 +10,266 @@ npm install @hardlydifficult/repo-processor
|
|
|
10
10
|
|
|
11
11
|
## Quick Start
|
|
12
12
|
|
|
13
|
-
Process all changed files and directories in a GitHub repository, persisting results to a git-backed YAML store:
|
|
14
|
-
|
|
15
13
|
```typescript
|
|
16
14
|
import { RepoProcessor, GitYamlStore } from "@hardlydifficult/repo-processor";
|
|
17
15
|
import { GitHubClient } from "@hardlydifficult/github";
|
|
18
16
|
|
|
19
|
-
|
|
17
|
+
// 1. Configure git-backed YAML store
|
|
20
18
|
const store = new GitYamlStore({
|
|
21
|
-
cloneUrl: "https://github.com/
|
|
22
|
-
localPath: "/tmp/
|
|
23
|
-
resultDir: (owner, repo) =>
|
|
24
|
-
|
|
19
|
+
cloneUrl: "https://github.com/owner/repo.git",
|
|
20
|
+
localPath: "/tmp/repo-store",
|
|
21
|
+
resultDir: (owner, repo) => `results/${owner}/${repo}`,
|
|
22
|
+
authToken: process.env.GITHUB_TOKEN,
|
|
23
|
+
gitUser: { name: "Processor Bot", email: "bot@example.com" },
|
|
24
|
+
});
|
|
25
|
+
|
|
26
|
+
// 2. Define processing callbacks
|
|
27
|
+
const callbacks = {
|
|
28
|
+
shouldProcess: (entry) => entry.type === "blob" && entry.path.endsWith(".ts"),
|
|
29
|
+
processFile: async ({ entry, content }) => ({
|
|
30
|
+
path: entry.path,
|
|
31
|
+
lineCount: content.split("\n").length,
|
|
32
|
+
}),
|
|
33
|
+
processDirectory: async ({ path, subtreeFilePaths, children }) => ({
|
|
34
|
+
path,
|
|
35
|
+
files: subtreeFilePaths.length,
|
|
36
|
+
dirs: children.filter((c) => c.isDir).length,
|
|
37
|
+
}),
|
|
38
|
+
};
|
|
39
|
+
|
|
40
|
+
// 3. Create and run processor
|
|
41
|
+
const github = new GitHubClient({ token: process.env.GITHUB_TOKEN });
|
|
42
|
+
const processor = new RepoProcessor({
|
|
43
|
+
githubClient: github,
|
|
44
|
+
store,
|
|
45
|
+
callbacks,
|
|
25
46
|
});
|
|
26
47
|
|
|
48
|
+
const result = await processor.run("owner", "repo");
|
|
49
|
+
// => { filesProcessed: 12, filesRemoved: 1, dirsProcessed: 4 }
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## RepoProcessor: Incremental Processing
|
|
53
|
+
|
|
54
|
+
`RepoProcessor` executes an incremental pipeline for processing GitHub file trees: fetch tree → diff → process changed files → remove deleted files → resolve stale directories → commit.
|
|
55
|
+
|
|
56
|
+
```typescript
|
|
57
|
+
import { RepoProcessor } from "@hardlydifficult/repo-processor";
|
|
58
|
+
|
|
27
59
|
const processor = new RepoProcessor({
|
|
28
60
|
githubClient,
|
|
29
61
|
store,
|
|
30
|
-
callbacks
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
path: entry.path,
|
|
34
|
-
sha: entry.sha,
|
|
35
|
-
size: content.length,
|
|
36
|
-
lines: content.split("\n").length
|
|
37
|
-
}),
|
|
38
|
-
processDirectory: async ({ path, children, tree }) => ({
|
|
39
|
-
path,
|
|
40
|
-
childrenCount: children.length,
|
|
41
|
-
totalFiles: tree.filter(e => e.type === "blob").length
|
|
42
|
-
})
|
|
43
|
-
},
|
|
44
|
-
concurrency: 5
|
|
62
|
+
callbacks,
|
|
63
|
+
concurrency: 5, // optional (default 5)
|
|
64
|
+
branch: "main", // optional (default "main")
|
|
45
65
|
});
|
|
46
66
|
|
|
47
67
|
const result = await processor.run("owner", "repo", (progress) => {
|
|
48
|
-
console.log(
|
|
68
|
+
console.log(
|
|
69
|
+
`Phase: ${progress.phase}, Files: ${progress.filesCompleted}/${progress.filesTotal}, Dirs: ${progress.dirsCompleted}/${progress.dirsTotal}`
|
|
70
|
+
);
|
|
49
71
|
});
|
|
50
|
-
|
|
51
|
-
console.log(result);
|
|
52
|
-
// {
|
|
53
|
-
// filesProcessed: 12,
|
|
54
|
-
// filesRemoved: 1,
|
|
55
|
-
// dirsProcessed: 4
|
|
56
|
-
// }
|
|
57
72
|
```
|
|
58
73
|
|
|
59
|
-
|
|
74
|
+
### RepoProcessorConfig
|
|
60
75
|
|
|
61
|
-
|
|
76
|
+
| Field | Type | Required | Default |
|
|
77
|
+
|-------|------|----------|---------|
|
|
78
|
+
| `githubClient` | `GitHubClient` | Yes | — |
|
|
79
|
+
| `store` | `ProcessorStore` | Yes | — |
|
|
80
|
+
| `callbacks` | `ProcessorCallbacks` | Yes | — |
|
|
81
|
+
| `concurrency` | `number` | No | `5` |
|
|
82
|
+
| `branch` | `string` | No | `"main"` |
|
|
62
83
|
|
|
63
|
-
|
|
64
|
-
- Fetching the file tree from GitHub
|
|
65
|
-
- Comparing against a stored manifest of file SHAs to detect changes
|
|
66
|
-
- Processing changed files in parallel batches
|
|
67
|
-
- Removing files that no longer exist
|
|
68
|
-
- Resolving and processing stale directories bottom-up
|
|
69
|
-
- Committing results to the git-backed store
|
|
84
|
+
### ProcessingResult
|
|
70
85
|
|
|
71
|
-
|
|
86
|
+
```typescript
|
|
87
|
+
{
|
|
88
|
+
filesProcessed: number; // Count of files processed
|
|
89
|
+
filesRemoved: number; // Count of deleted files
|
|
90
|
+
dirsProcessed: number; // Count of directories processed
|
|
91
|
+
}
|
|
92
|
+
```
|
|
72
93
|
|
|
73
|
-
|
|
74
|
-
|--------|-------------|---------|
|
|
75
|
-
| `githubClient` | GitHub client for tree and file access | — |
|
|
76
|
-
| `store` | Persistence layer for file/dir results and manifests | — |
|
|
77
|
-
| `callbacks` | Domain logic for filtering and processing | — |
|
|
78
|
-
| `concurrency` | Max parallel file/dir operations | `5` |
|
|
79
|
-
| `branch` | Git branch to use | `"main"` |
|
|
94
|
+
### File and Directory Contexts
|
|
80
95
|
|
|
81
|
-
|
|
96
|
+
```typescript
|
|
97
|
+
interface FileContext {
|
|
98
|
+
entry: TreeEntry;
|
|
99
|
+
content: string;
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
interface DirectoryContext {
|
|
103
|
+
path: string;
|
|
104
|
+
sha: string;
|
|
105
|
+
subtreeFilePaths: string[];
|
|
106
|
+
children: DirectoryChild[];
|
|
107
|
+
tree: TreeEntry[];
|
|
108
|
+
}
|
|
82
109
|
|
|
83
|
-
|
|
110
|
+
interface DirectoryChild {
|
|
111
|
+
name: string;
|
|
112
|
+
isDir: boolean;
|
|
113
|
+
fullPath: string;
|
|
114
|
+
}
|
|
115
|
+
```
|
|
84
116
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
117
|
+
## RepoWatcher: SHA-based Triggering
|
|
118
|
+
|
|
119
|
+
`RepoWatcher` monitors GitHub repos for SHA changes and triggers processing with automatic retries, concurrency control, and state persistence.
|
|
88
120
|
|
|
89
121
|
```typescript
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
122
|
+
import { RepoWatcher } from "@hardlydifficult/repo-processor";
|
|
123
|
+
|
|
124
|
+
const watcher = new RepoWatcher({
|
|
125
|
+
stateKey: "repo-state",
|
|
126
|
+
stateDirectory: "/tmp/state",
|
|
127
|
+
run: async (owner, name) => {
|
|
128
|
+
const processor = new RepoProcessor({ /* config */ });
|
|
129
|
+
return processor.run(owner, name);
|
|
130
|
+
},
|
|
131
|
+
onComplete: (owner, name, result, sha) => {
|
|
132
|
+
console.log(`Completed ${owner}/${name}: ${result.filesProcessed} files`);
|
|
133
|
+
},
|
|
134
|
+
onError: (owner, name, error) => {
|
|
135
|
+
console.error(`Failed ${owner}/${name}:`, error);
|
|
136
|
+
},
|
|
137
|
+
maxAttempts: 3, // optional retries
|
|
95
138
|
});
|
|
96
139
|
|
|
97
|
-
|
|
98
|
-
const validated = await store.loadFileResult("owner", "repo", "src/index.ts", z.object({
|
|
99
|
-
path: z.string(),
|
|
100
|
-
sha: z.string(),
|
|
101
|
-
size: z.number(),
|
|
102
|
-
lines: z.number()
|
|
103
|
-
}));
|
|
104
|
-
```
|
|
140
|
+
await watcher.init();
|
|
105
141
|
|
|
106
|
-
|
|
142
|
+
// Handle push events (SHA comparison performed automatically)
|
|
143
|
+
watcher.handlePush("hardlydifficult", "typescript", "abc123...");
|
|
107
144
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
2. Directories whose stored tree SHA doesn’t match the current tree SHA
|
|
145
|
+
// Manual trigger (no SHA comparison)
|
|
146
|
+
watcher.trigger("hardlydifficult", "typescript");
|
|
111
147
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
const staleDirs = await resolveStaleDirectories(
|
|
116
|
-
"owner",
|
|
117
|
-
"repo",
|
|
118
|
-
diff.staleDirs, // dirs flagged by diffTree
|
|
119
|
-
allFilePaths, // current file paths in tree
|
|
120
|
-
tree, // full tree array
|
|
121
|
-
store // store for SHA comparison
|
|
122
|
-
);
|
|
148
|
+
// Awaitable trigger (the returned promise resolves once processing completes)
|
|
149
|
+
const response = await watcher.triggerManual("hardlydifficult", "typescript");
|
|
150
|
+
// => { success: true, result: ProcessingResult } | { success: false, reason: string }
|
|
123
151
|
```
|
|
124
152
|
|
|
125
|
-
###
|
|
153
|
+
### RepoWatcherConfig
|
|
126
154
|
|
|
127
|
-
|
|
155
|
+
| Field | Type | Required | Description |
|
|
156
|
+
|-------|------|----------|-------------|
|
|
157
|
+
| `stateKey` | `string` | Yes | Key for state persistence |
|
|
158
|
+
| `stateDirectory` | `string` | Yes | Directory for state files |
|
|
159
|
+
| `run` | `(owner, name) => Promise<TResult>` | Yes | Processing logic |
|
|
160
|
+
| `onComplete` | `(owner, name, result, sha) => void` | No | Success callback |
|
|
161
|
+
| `onError` | `(owner, name, error) => void` | No | Failure callback |
|
|
162
|
+
| `autoSaveMs` | `number` | No | State auto-save interval in milliseconds; default `5000` (5s) |
|
|
163
|
+
| `maxAttempts` | `number` | No | Maximum number of attempts per run; default `1` (no retry) |
|
|
128
164
|
|
|
129
|
-
|
|
130
|
-
|--------|---------|
|
|
131
|
-
| `ensureReady?(owner, repo)` | Initialize store (e.g., clone/pull repo) |
|
|
132
|
-
| `getFileManifest(owner, repo)` | Retrieve stored file SHAs |
|
|
133
|
-
| `getDirSha(owner, repo, dirPath)` | Retrieve stored directory SHA |
|
|
134
|
-
| `writeFileResult(owner, repo, path, sha, result)` | Persist file result |
|
|
135
|
-
| `writeDirResult(owner, repo, path, sha, result)` | Persist directory result |
|
|
136
|
-
| `deleteFileResult(owner, repo, path)` | Remove deleted file result |
|
|
137
|
-
| `commitBatch(owner, repo, count)` | Commit batch of changes |
|
|
165
|
+
## GitYamlStore: YAML Persistence
|
|
138
166
|
|
|
139
|
-
|
|
167
|
+
`GitYamlStore` implements `ProcessorStore` by persisting results as YAML files in a git repository.
|
|
140
168
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
Passed to `processFile`:
|
|
169
|
+
```typescript
|
|
170
|
+
import { GitYamlStore } from "@hardlydifficult/repo-processor";
|
|
144
171
|
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
172
|
+
const store = new GitYamlStore({
|
|
173
|
+
cloneUrl: "https://github.com/owner/repo.git",
|
|
174
|
+
localPath: "/tmp/store",
|
|
175
|
+
resultDir: (owner, repo) => `results/${owner}/${repo}`,
|
|
176
|
+
authToken: process.env.GITHUB_TOKEN, // optional; falls back to the GITHUB_TOKEN environment variable
|
|
177
|
+
gitUser: { name: "Processor", email: "bot@example.com" },
|
|
178
|
+
});
|
|
179
|
+
```
|
|
149
180
|
|
|
150
|
-
|
|
181
|
+
### Typed Result Loading
|
|
151
182
|
|
|
152
|
-
|
|
183
|
+
```typescript
|
|
184
|
+
import { z } from "zod";
|
|
153
185
|
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
| `children` | Immediate children (files and directories) |
|
|
160
|
-
| `tree` | Full tree slice for the directory |
|
|
186
|
+
const fileSchema = z.object({
|
|
187
|
+
path: z.string(),
|
|
188
|
+
lineCount: z.number(),
|
|
189
|
+
sha: z.string(),
|
|
190
|
+
});
|
|
161
191
|
|
|
162
|
-
|
|
192
|
+
const dirSchema = z.object({
|
|
193
|
+
path: z.string(),
|
|
194
|
+
files: z.number(),
|
|
195
|
+
dirs: z.number(),
|
|
196
|
+
sha: z.string(),
|
|
197
|
+
});
|
|
163
198
|
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
| `processFile` | `(ctx: FileContext) => Promise<unknown>` | Process a single file |
|
|
168
|
-
| `processDirectory` | `(ctx: DirectoryContext) => Promise<unknown>` | Process directory after all children |
|
|
199
|
+
const fileResult = await store.loadFileResult("owner", "repo", "src/index.ts", fileSchema);
|
|
200
|
+
const dirResult = await store.loadDirResult("owner", "repo", "src", dirSchema);
|
|
201
|
+
```
|
|
169
202
|
|
|
170
|
-
|
|
203
|
+
## resolveStaleDirectories: Stale Directory Resolution
|
|
171
204
|
|
|
172
|
-
|
|
205
|
+
`resolveStaleDirectories` determines which directories need reprocessing by combining SHA-based detection with diff-derived stale directories.
|
|
173
206
|
|
|
174
207
|
```typescript
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
208
|
+
import { resolveStaleDirectories } from "@hardlydifficult/repo-processor";
|
|
209
|
+
|
|
210
|
+
const staleDirs = await resolveStaleDirectories(
|
|
211
|
+
owner,
|
|
212
|
+
repo,
|
|
213
|
+
staleDirsFromDiff,
|
|
214
|
+
allFilePaths,
|
|
215
|
+
tree,
|
|
216
|
+
store
|
|
217
|
+
);
|
|
182
218
|
```
|
|
183
219
|
|
|
184
|
-
|
|
220
|
+
### Algorithm
|
|
221
|
+
|
|
222
|
+
- All directories derived from file paths (and root `""`) are checked
|
|
223
|
+
- A directory is stale if:
|
|
224
|
+
- Its stored SHA is missing, or
|
|
225
|
+
- Its stored SHA differs from the current tree SHA
|
|
226
|
+
- Stale directories from diff (e.g., due to file changes) are also included
|
|
185
227
|
|
|
186
|
-
|
|
228
|
+
## ProcessorStore Interface
|
|
187
229
|
|
|
188
230
|
```typescript
|
|
189
|
-
interface
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
231
|
+
interface ProcessorStore {
|
|
232
|
+
ensureReady?(owner: string, repo: string): Promise<void>;
|
|
233
|
+
getFileManifest(owner: string, repo: string): Promise<FileManifest>;
|
|
234
|
+
getDirSha(owner: string, repo: string, dirPath: string): Promise<string | null>;
|
|
235
|
+
writeFileResult(owner: string, repo: string, path: string, sha: string, result: unknown): Promise<void>;
|
|
236
|
+
writeDirResult(owner: string, repo: string, path: string, sha: string, result: unknown): Promise<void>;
|
|
237
|
+
deleteFileResult(owner: string, repo: string, path: string): Promise<void>;
|
|
238
|
+
commitBatch(owner: string, repo: string, count: number): Promise<void>;
|
|
193
239
|
}
|
|
194
240
|
```
|
|
195
241
|
|
|
196
|
-
##
|
|
242
|
+
## ProcessorCallbacks Interface
|
|
197
243
|
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
8. **Process directories bottom-up** – Processes deepest directories first
|
|
206
|
-
9. **Commit** – Finalizes all changes to the git store
|
|
207
|
-
|
|
208
|
-
## Error Handling
|
|
244
|
+
```typescript
|
|
245
|
+
interface ProcessorCallbacks {
|
|
246
|
+
shouldProcess(entry: TreeEntry): boolean;
|
|
247
|
+
processFile(ctx: FileContext): Promise<unknown>;
|
|
248
|
+
processDirectory(ctx: DirectoryContext): Promise<unknown>;
|
|
249
|
+
}
|
|
250
|
+
```
|
|
209
251
|
|
|
210
|
-
|
|
211
|
-
- Failed file processing stops the pipeline immediately with a summary
|
|
212
|
-
- Directory processing continues on individual failures but fails fast overall
|
|
252
|
+
## Progress Reporting
|
|
213
253
|
|
|
214
|
-
|
|
254
|
+
```typescript
|
|
255
|
+
interface ProcessingProgress {
|
|
256
|
+
phase: "loading" | "files" | "directories" | "committing";
|
|
257
|
+
message: string;
|
|
258
|
+
filesTotal: number;
|
|
259
|
+
filesCompleted: number;
|
|
260
|
+
dirsTotal: number;
|
|
261
|
+
dirsCompleted: number;
|
|
262
|
+
}
|
|
215
263
|
|
|
216
|
-
|
|
264
|
+
type ProgressCallback = (progress: ProcessingProgress) => void;
|
|
265
|
+
```
|
|
217
266
|
|
|
218
|
-
|
|
219
|
-
- Their stored SHA differs from the current tree SHA, or
|
|
220
|
-
- They have no stored SHA (first run)
|
|
267
|
+
## Setup
|
|
221
268
|
|
|
222
|
-
|
|
269
|
+
No external service setup beyond GitHub is required. The package uses `@hardlydifficult/github` for tree fetches and `simple-git` for git operations.
|
|
223
270
|
|
|
224
|
-
###
|
|
271
|
+
### Environment Variables
|
|
225
272
|
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
- Batches commit to the store individually for progress durability
|
|
273
|
+
| Variable | Usage |
|
|
274
|
+
|----------|-------|
|
|
275
|
+
| `GITHUB_TOKEN` | Used by `GitYamlStore` for authenticated git operations if `authToken` is not provided |
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@hardlydifficult/repo-processor",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.44",
|
|
4
4
|
"main": "./dist/index.js",
|
|
5
5
|
"types": "./dist/index.d.ts",
|
|
6
6
|
"files": [
|
|
@@ -15,19 +15,19 @@
|
|
|
15
15
|
"clean": "rm -rf dist"
|
|
16
16
|
},
|
|
17
17
|
"dependencies": {
|
|
18
|
-
"@hardlydifficult/collections": "1.0.
|
|
18
|
+
"@hardlydifficult/collections": "1.0.7",
|
|
19
19
|
"@hardlydifficult/github": "1.0.27",
|
|
20
|
-
"@hardlydifficult/state-tracker": "2.0.
|
|
21
|
-
"@hardlydifficult/text": "1.0.
|
|
20
|
+
"@hardlydifficult/state-tracker": "2.0.17",
|
|
21
|
+
"@hardlydifficult/text": "1.0.26",
|
|
22
22
|
"simple-git": "3.31.1",
|
|
23
23
|
"yaml": "2.8.2",
|
|
24
24
|
"zod": "4.3.6"
|
|
25
25
|
},
|
|
26
26
|
"peerDependencies": {
|
|
27
|
-
"@hardlydifficult/collections": "1.0.
|
|
27
|
+
"@hardlydifficult/collections": "1.0.7",
|
|
28
28
|
"@hardlydifficult/github": "1.0.27",
|
|
29
|
-
"@hardlydifficult/state-tracker": "2.0.
|
|
30
|
-
"@hardlydifficult/text": "1.0.
|
|
29
|
+
"@hardlydifficult/state-tracker": "2.0.17",
|
|
30
|
+
"@hardlydifficult/text": "1.0.26"
|
|
31
31
|
},
|
|
32
32
|
"devDependencies": {
|
|
33
33
|
"@types/node": "25.2.3",
|