@hardlydifficult/repo-processor 1.0.33 → 1.0.35
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +162 -106
- package/package.json +3 -3
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# @hardlydifficult/repo-processor
|
|
2
2
|
|
|
3
|
-
Incremental GitHub repository processor
|
|
3
|
+
Incremental GitHub repository processor with SHA-based stale detection, parallel file processing, and bottom-up directory updates.
|
|
4
4
|
|
|
5
5
|
## Installation
|
|
6
6
|
|
|
@@ -10,164 +10,220 @@ npm install @hardlydifficult/repo-processor
|
|
|
10
10
|
|
|
11
11
|
## Quick Start
|
|
12
12
|
|
|
13
|
-
Process all
|
|
13
|
+
Process all changed files and directories in a GitHub repository, persisting results to a git-backed YAML store:
|
|
14
14
|
|
|
15
15
|
```typescript
|
|
16
16
|
import { RepoProcessor, GitYamlStore } from "@hardlydifficult/repo-processor";
|
|
17
17
|
import { GitHubClient } from "@hardlydifficult/github";
|
|
18
18
|
|
|
19
|
-
|
|
19
|
+
const githubClient = new GitHubClient("owner", "repo", "main", process.env.GITHUB_TOKEN!);
|
|
20
20
|
const store = new GitYamlStore({
|
|
21
|
-
cloneUrl: "https://github.com/
|
|
22
|
-
localPath: "
|
|
21
|
+
cloneUrl: "https://github.com/your-org/results-repo.git",
|
|
22
|
+
localPath: "/tmp/results",
|
|
23
23
|
resultDir: (owner, repo) => `${owner}/${repo}`,
|
|
24
|
+
gitUser: { name: "CI Bot", email: "bot@example.com" }
|
|
24
25
|
});
|
|
25
26
|
|
|
26
|
-
// Create GitHub client (requires GITHUB_TOKEN env var or token in constructor)
|
|
27
|
-
const github = new GitHubClient({ authToken: process.env.GITHUB_TOKEN });
|
|
28
|
-
|
|
29
|
-
// Create processor with custom callbacks
|
|
30
27
|
const processor = new RepoProcessor({
|
|
31
|
-
githubClient
|
|
28
|
+
githubClient,
|
|
32
29
|
store,
|
|
33
30
|
callbacks: {
|
|
34
|
-
shouldProcess: (entry) => entry.path.endsWith(".ts"),
|
|
35
|
-
async
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
},
|
|
31
|
+
shouldProcess: (entry) => entry.type === "blob" && entry.path.endsWith(".ts"),
|
|
32
|
+
processFile: async ({ entry, content }) => ({
|
|
33
|
+
path: entry.path,
|
|
34
|
+
sha: entry.sha,
|
|
35
|
+
size: content.length,
|
|
36
|
+
lines: content.split("\n").length
|
|
37
|
+
}),
|
|
38
|
+
processDirectory: async ({ path, children, tree }) => ({
|
|
39
|
+
path,
|
|
40
|
+
childrenCount: children.length,
|
|
41
|
+
totalFiles: tree.filter(e => e.type === "blob").length
|
|
42
|
+
})
|
|
41
43
|
},
|
|
44
|
+
concurrency: 5
|
|
42
45
|
});
|
|
43
46
|
|
|
44
|
-
// Run the processor
|
|
45
47
|
const result = await processor.run("owner", "repo", (progress) => {
|
|
46
48
|
console.log(`${progress.phase}: ${progress.filesCompleted}/${progress.filesTotal} files`);
|
|
47
49
|
});
|
|
50
|
+
|
|
51
|
+
console.log(result);
|
|
52
|
+
// {
|
|
53
|
+
// filesProcessed: 12,
|
|
54
|
+
// filesRemoved: 1,
|
|
55
|
+
// dirsProcessed: 4
|
|
56
|
+
// }
|
|
48
57
|
```
|
|
49
58
|
|
|
50
59
|
## Core Concepts
|
|
51
60
|
|
|
52
61
|
### RepoProcessor
|
|
53
62
|
|
|
54
|
-
The main
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
63
|
+
The main pipeline orchestrates incremental repository processing by:
|
|
64
|
+
- Fetching the file tree from GitHub
|
|
65
|
+
- Comparing against a stored manifest of file SHAs to detect changes
|
|
66
|
+
- Processing changed files in parallel batches
|
|
67
|
+
- Removing files that no longer exist
|
|
68
|
+
- Resolving and processing stale directories bottom-up
|
|
69
|
+
- Committing results to the git-backed store
|
|
61
70
|
|
|
62
|
-
|
|
71
|
+
#### Configuration
|
|
63
72
|
|
|
64
|
-
|
|
|
65
|
-
|
|
66
|
-
| `githubClient` | GitHub client for
|
|
67
|
-
| `store` |
|
|
68
|
-
| `callbacks` |
|
|
69
|
-
| `concurrency
|
|
70
|
-
| `branch
|
|
73
|
+
| Option | Description | Default |
|
|
74
|
+
|--------|-------------|---------|
|
|
75
|
+
| `githubClient` | GitHub client for tree and file access | — |
|
|
76
|
+
| `store` | Persistence layer for file/dir results and manifests | — |
|
|
77
|
+
| `callbacks` | Domain logic for filtering and processing | — |
|
|
78
|
+
| `concurrency` | Max parallel file/dir operations | `5` |
|
|
79
|
+
| `branch` | Git branch to use | `"main"` |
|
|
71
80
|
|
|
72
|
-
###
|
|
81
|
+
### GitYamlStore
|
|
73
82
|
|
|
74
|
-
|
|
83
|
+
A `ProcessorStore` implementation that persists results as YAML files in a git repository.
|
|
75
84
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
| `processFile(ctx)` | Processes a file's content; result saved to store |
|
|
80
|
-
| `processDirectory(ctx)` | Processes a directory after all children are processed |
|
|
85
|
+
**File results** are stored at `<resultDir>/<filePath>.yml`.
|
|
86
|
+
**Directory results** are stored at `<resultDir>/<dirPath>/dir.yml`.
|
|
87
|
+
Each stored result file includes a `sha` field, which is used for change detection on later runs.
|
|
81
88
|
|
|
82
|
-
|
|
89
|
+
```typescript
|
|
90
|
+
const store = new GitYamlStore({
|
|
91
|
+
cloneUrl: "https://github.com/your-org/results-repo.git",
|
|
92
|
+
localPath: "/tmp/results",
|
|
93
|
+
resultDir: (owner, repo) => `${owner}/${repo}`,
|
|
94
|
+
gitUser: { name: "CI Bot", email: "bot@example.com" }
|
|
95
|
+
});
|
|
83
96
|
|
|
84
|
-
|
|
97
|
+
// Later, load results back, validated against a Zod schema (requires `import { z } from "zod"`)
|
|
98
|
+
const validated = await store.loadFileResult("owner", "repo", "src/index.ts", z.object({
|
|
99
|
+
path: z.string(),
|
|
100
|
+
sha: z.string(),
|
|
101
|
+
size: z.number(),
|
|
102
|
+
lines: z.number()
|
|
103
|
+
}));
|
|
104
|
+
```
|
|
85
105
|
|
|
86
|
-
|
|
87
|
-
|-------|------|-------------|
|
|
88
|
-
| `entry` | `TreeEntry` | Tree entry metadata (path, sha, type) |
|
|
89
|
-
| `content` | `string` | Raw file contents from GitHub |
|
|
106
|
+
### resolveStaleDirectories
|
|
90
107
|
|
|
91
|
-
|
|
108
|
+
Determines which directories require reprocessing by combining:
|
|
109
|
+
1. Directories identified as stale by `diffTree` (due to file changes/removals)
|
|
110
|
+
2. Directories whose stored tree SHA doesn’t match the current tree SHA
|
|
92
111
|
|
|
93
|
-
|
|
112
|
+
```typescript
|
|
113
|
+
import { resolveStaleDirectories } from "@hardlydifficult/repo-processor";
|
|
94
114
|
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
115
|
+
const staleDirs = await resolveStaleDirectories(
|
|
116
|
+
"owner",
|
|
117
|
+
"repo",
|
|
118
|
+
diff.staleDirs, // dirs flagged by diffTree
|
|
119
|
+
allFilePaths, // current file paths in tree
|
|
120
|
+
tree, // full tree array
|
|
121
|
+
store // store for SHA comparison
|
|
122
|
+
);
|
|
123
|
+
```
|
|
102
124
|
|
|
103
|
-
###
|
|
125
|
+
### Store Interface
|
|
104
126
|
|
|
105
|
-
|
|
127
|
+
The `ProcessorStore` interface defines the contract for persistence implementations:
|
|
106
128
|
|
|
107
|
-
|
|
|
108
|
-
|
|
109
|
-
| `
|
|
110
|
-
| `
|
|
111
|
-
| `
|
|
129
|
+
| Method | Purpose |
|
|
130
|
+
|--------|---------|
|
|
131
|
+
| `ensureReady?(owner, repo)` | Initialize store (e.g., clone/pull repo) |
|
|
132
|
+
| `getFileManifest(owner, repo)` | Retrieve stored file SHAs |
|
|
133
|
+
| `getDirSha(owner, repo, dirPath)` | Retrieve stored directory SHA |
|
|
134
|
+
| `writeFileResult(owner, repo, path, sha, result)` | Persist file result |
|
|
135
|
+
| `writeDirResult(owner, repo, path, sha, result)` | Persist directory result |
|
|
136
|
+
| `deleteFileResult(owner, repo, path)` | Remove deleted file result |
|
|
137
|
+
| `commitBatch(owner, repo, count)` | Commit batch of changes |
|
|
112
138
|
|
|
113
|
-
###
|
|
139
|
+
### Contexts and Callbacks
|
|
114
140
|
|
|
115
|
-
|
|
141
|
+
#### `FileContext`
|
|
116
142
|
|
|
117
|
-
|
|
118
|
-
const store = new GitYamlStore({
|
|
119
|
-
cloneUrl: "https://github.com/owner/repo.git",
|
|
120
|
-
localPath: "./results",
|
|
121
|
-
resultDir: (owner, repo) => `${owner}/${repo}`,
|
|
122
|
-
gitUser: { name: "your-bot-name", email: "your-bot@users.noreply.github.com" },
|
|
123
|
-
authToken: "optional; falls back to GITHUB_TOKEN",
|
|
124
|
-
});
|
|
125
|
-
```
|
|
143
|
+
Passed to `processFile`:
|
|
126
144
|
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
145
|
+
| Field | Description |
|
|
146
|
+
|-------|-------------|
|
|
147
|
+
| `entry` | Tree entry for the file |
|
|
148
|
+
| `content` | File content as string |
|
|
130
149
|
|
|
131
|
-
|
|
150
|
+
#### `DirectoryContext`
|
|
132
151
|
|
|
133
|
-
|
|
134
|
-
1. Directories identified as stale via `diffTree` (changed/removed children)
|
|
135
|
-
2. Directories whose stored SHA differs from the current tree SHA
|
|
152
|
+
Passed to `processDirectory`:
|
|
136
153
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
store
|
|
145
|
-
);
|
|
146
|
-
```
|
|
154
|
+
| Field | Description |
|
|
155
|
+
|-------|-------------|
|
|
156
|
+
| `path` | Directory path (`""` for root) |
|
|
157
|
+
| `sha` | Tree SHA for the directory |
|
|
158
|
+
| `subtreeFilePaths` | All file paths under this directory |
|
|
159
|
+
| `children` | Immediate children (files and directories) |
|
|
160
|
+
| `tree` | Full tree slice for the directory |
|
|
147
161
|
|
|
148
|
-
|
|
162
|
+
#### `ProcessorCallbacks`
|
|
149
163
|
|
|
150
|
-
|
|
164
|
+
| Method | Signature | Purpose |
|
|
165
|
+
|--------|-----------|---------|
|
|
166
|
+
| `shouldProcess` | `(entry: TreeEntry) => boolean` | Filter which entries to process |
|
|
167
|
+
| `processFile` | `(ctx: FileContext) => Promise<unknown>` | Process a single file |
|
|
168
|
+
| `processDirectory` | `(ctx: DirectoryContext) => Promise<unknown>` | Process directory after all children |
|
|
169
|
+
|
|
170
|
+
### Progress Reporting
|
|
171
|
+
|
|
172
|
+
The optional `onProgress` callback provides real-time updates:
|
|
151
173
|
|
|
152
174
|
```typescript
|
|
153
|
-
|
|
154
|
-
phase
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
}
|
|
175
|
+
await processor.run("owner", "repo", (progress) => {
|
|
176
|
+
console.log(progress.phase); // "loading" | "files" | "directories" | "committing"
|
|
177
|
+
console.log(progress.filesTotal); // Total files to process
|
|
178
|
+
console.log(progress.filesCompleted);
|
|
179
|
+
console.log(progress.dirsTotal); // Total directories to process
|
|
180
|
+
console.log(progress.dirsCompleted);
|
|
181
|
+
});
|
|
161
182
|
```
|
|
162
183
|
|
|
163
|
-
## Result
|
|
184
|
+
## Processing Result
|
|
164
185
|
|
|
165
|
-
`
|
|
186
|
+
The `run()` method returns:
|
|
166
187
|
|
|
167
188
|
```typescript
|
|
168
189
|
interface ProcessingResult {
|
|
169
|
-
filesProcessed: number;
|
|
170
|
-
filesRemoved: number;
|
|
171
|
-
dirsProcessed: number;
|
|
190
|
+
filesProcessed: number; // Files processed (including updates)
|
|
191
|
+
filesRemoved: number; // Files deleted
|
|
192
|
+
dirsProcessed: number; // Directories processed
|
|
172
193
|
}
|
|
173
|
-
```
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
## Pipeline Stages
|
|
197
|
+
|
|
198
|
+
1. **Init store** – Calls `ensureReady()` if implemented
|
|
199
|
+
2. **Fetch tree** – Retrieves full file tree from GitHub
|
|
200
|
+
3. **Filter** – Applies `shouldProcess` to file entries
|
|
201
|
+
4. **Diff** – Compares current manifest with stored manifest
|
|
202
|
+
5. **Process files** – Fetches content, calls `processFile`, persists results
|
|
203
|
+
6. **Remove files** – Deletes results for removed files
|
|
204
|
+
7. **Resolve stale directories** – Uses SHA mismatch detection
|
|
205
|
+
8. **Process directories bottom-up** – Processes deepest directories first
|
|
206
|
+
9. **Commit** – Finalizes all changes to the git store
|
|
207
|
+
|
|
208
|
+
## Error Handling
|
|
209
|
+
|
|
210
|
+
- File and directory errors are aggregated and reported with full path details
|
|
211
|
+
- Failed file processing stops the pipeline immediately with a summary
|
|
212
|
+
- Directory processing continues on individual failures but fails fast overall
|
|
213
|
+
|
|
214
|
+
## Appendices
|
|
215
|
+
|
|
216
|
+
### SHA-Based Stale Detection
|
|
217
|
+
|
|
218
|
+
Directories are marked stale when:
|
|
219
|
+
- Their stored SHA differs from the current tree SHA, or
|
|
220
|
+
- They have no stored SHA (first run)
|
|
221
|
+
|
|
222
|
+
This enables recovery after partial failures and catches directories whose tree SHA changed without file changes.
|
|
223
|
+
|
|
224
|
+
### Parallel Processing
|
|
225
|
+
|
|
226
|
+
Files and directories are processed in batches controlled by `concurrency`:
|
|
227
|
+
- Files are grouped into batches and processed in parallel
|
|
228
|
+
- Directories are grouped by depth and processed bottom-up (deepest level first), with the directories at each depth processed in parallel batches
|
|
229
|
+
- Batches commit to the store individually for progress durability
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@hardlydifficult/repo-processor",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.35",
|
|
4
4
|
"main": "./dist/index.js",
|
|
5
5
|
"types": "./dist/index.d.ts",
|
|
6
6
|
"files": [
|
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
"clean": "rm -rf dist"
|
|
16
16
|
},
|
|
17
17
|
"dependencies": {
|
|
18
|
-
"@hardlydifficult/collections": "1.0.
|
|
18
|
+
"@hardlydifficult/collections": "1.0.6",
|
|
19
19
|
"@hardlydifficult/github": "1.0.27",
|
|
20
20
|
"@hardlydifficult/text": "1.0.23",
|
|
21
21
|
"simple-git": "3.31.1",
|
|
@@ -23,7 +23,7 @@
|
|
|
23
23
|
"zod": "4.3.6"
|
|
24
24
|
},
|
|
25
25
|
"peerDependencies": {
|
|
26
|
-
"@hardlydifficult/collections": "1.0.
|
|
26
|
+
"@hardlydifficult/collections": "1.0.6",
|
|
27
27
|
"@hardlydifficult/github": "1.0.27",
|
|
28
28
|
"@hardlydifficult/text": "1.0.23"
|
|
29
29
|
},
|