h56-github-scrapper 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/main-scrapping.js +684 -0
- package/package.json +23 -0
- package/readme.md +320 -0
- package/script/ensure-external-deps.js +55 -0
- package/translate-engine/translate.js +35 -0
- package/translate-engine/translate.ts +72 -0
|
@@ -0,0 +1,684 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* main-scrapping.js
|
|
3
|
+
*
|
|
4
|
+
* NPM package style single-file scraper for GitHub user data.
|
|
5
|
+
* Package name suggestion: h56-github-scrapper
|
|
6
|
+
*
|
|
7
|
+
* Features:
|
|
8
|
+
* - Exports programmatic functions: scrapeProfile, scrapeRepos, scrapeUser, calculateStats
|
|
9
|
+
* - CLI entry when run directly: node main-scrapping.js <username> [--json] [--output=file]
|
|
10
|
+
* - Optional translator integration (h56-translator) with selectable fields to translate
|
|
11
|
+
* - Automatic install of missing npm runtime dependencies (asks for consent when needed)
|
|
12
|
+
* - Robust retry/backoff, polite scraping delay, spinner (ora) fallback
|
|
13
|
+
* - Well-structured results and JSON output support
|
|
14
|
+
*
|
|
15
|
+
* NOTE:
|
|
16
|
+
* - The recommended (proper) approach for publishing as an npm package is to list dependencies
|
|
17
|
+
* in package.json. The runtime auto-installer is implemented as a convenience only.
|
|
18
|
+
* - Scraping HTML may break if GitHub changes markup. Consider using GitHub API for production.
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
const fs = require("fs");
|
|
22
|
+
const path = require("path");
|
|
23
|
+
const { spawnSync } = require("child_process");
|
|
24
|
+
const os = require("os");
|
|
25
|
+
|
|
26
|
+
// -------------------------
|
|
27
|
+
// Ensure runtime deps
|
|
28
|
+
// -------------------------
|
|
29
|
+
/**
 * Ask a question on the terminal and block until a line is entered.
 *
 * The read is synchronous (fs.readSync on stdin) because the caller runs at
 * module load time, before any event-loop turn could deliver a readline
 * callback.
 *
 * @param {string} question - Prompt text written to stdout.
 * @returns {string|null} The trimmed, lower-cased answer, or null when stdin
 *   could not be read (for example EAGAIN on a non-blocking descriptor).
 */
function readAnswerSync(question) {
  process.stdout.write(question);
  const buffer = Buffer.alloc(1024);
  try {
    const bytesRead = fs.readSync(process.stdin.fd, buffer, 0, buffer.length, null);
    return buffer.toString("utf8", 0, bytesRead).trim().toLowerCase();
  } catch (e) {
    return null;
  }
}

/**
 * Make sure every package in `deps` can be resolved, installing the missing
 * ones with `npm install --save`.
 *
 * In CI or when stdin is not a TTY the install runs without asking. Otherwise
 * the user is prompted; an empty answer, "y" or "yes" means consent, and an
 * unreadable stdin defaults to installing. The process exits with code 1 when
 * consent is refused or the installation fails.
 *
 * @param {string[]} [deps=[]] - Package names to check.
 * @returns {void}
 */
function ensureDependencies(deps = []) {
  const missing = deps.filter((dep) => {
    try {
      require.resolve(dep);
      return false;
    } catch (e) {
      return true;
    }
  });

  if (!missing.length) return;

  console.log(
    `Dependencies missing: ${missing.join(", ")}. The script can install them automatically.`
  );

  // If running non-interactive environment, install automatically.
  let consent = false;
  if (process.env.CI || !process.stdin.isTTY) {
    consent = true;
    console.log("Non-interactive environment detected, installing automatically...");
  } else {
    // A busy-wait loop cannot be used here: it would starve the event loop, so
    // an asynchronous readline callback would never fire while we are waiting.
    const answer = readAnswerSync("Install missing dependencies now? (Y/n): ");
    if (answer === null) {
      // default to yes
      consent = true;
      console.log("\nNo answer, defaulting to install.\n");
    } else {
      consent = !answer || answer === "y" || answer === "yes";
    }
  }

  if (!consent) {
    console.error("Cannot proceed without required dependencies. Exiting.");
    process.exit(1);
  }

  const isWindows = process.platform === "win32";

  // npm.cmd has to go through a shell on Windows, and a shell does not escape
  // arguments, so only plain package specs are accepted on that path.
  const plainSpec = /^(?:@[a-z0-9][a-z0-9._~-]*\/)?[a-z0-9][a-z0-9._~-]*(?:@[a-z0-9._~-]+)?$/i;
  if (isWindows && missing.some((name) => !plainSpec.test(String(name)))) {
    console.error("Automatic installation failed. Please run:");
    console.error(` npm install ${missing.join(" ")}`);
    process.exit(1);
  }

  // Run npm install for missing deps
  console.log(`Installing: ${missing.join(" ")} ...`);
  const args = ["install", "--save", ...missing];
  const result = spawnSync(isWindows ? "npm.cmd" : "npm", args, {
    stdio: "inherit",
    shell: isWindows,
  });

  if (result.error || result.status !== 0) {
    console.error("Automatic installation failed. Please run:");
    console.error(` npm install ${missing.join(" ")}`);
    process.exit(1);
  }

  console.log("Dependencies installed, continuing...");
}
|
|
116
|
+
|
|
117
|
+
// Required runtime modules: verify they resolve (installing them if needed)
// before any of them is loaded.
ensureDependencies(["axios", "cheerio", "ora", "yargs"]);

// now require installed modules
const axios = require("axios");
const cheerio = require("cheerio");
const yargs = require("yargs");

// ora 6 and later are published as ES modules. Depending on the Node version,
// require() then either throws or returns a namespace object whose factory is
// under `default`. Accept both shapes and fall back to a minimal plain-text
// spinner so the scraper keeps working either way.
const ora = (() => {
  try {
    const loaded = require("ora");
    const factory = typeof loaded === "function" ? loaded : loaded && loaded.default;
    if (typeof factory === "function") return factory;
  } catch (e) {
    // ora cannot be loaded through require(); use the fallback below.
  }

  return function createPlainSpinner(options) {
    const initialText = typeof options === "string" ? options : (options && options.text) || "";
    return {
      text: initialText,
      start() {
        if (this.text) console.error(this.text);
        return this;
      },
      succeed(message) {
        console.error(`✔ ${message || this.text}`);
        return this;
      },
      fail(message) {
        console.error(`✖ ${message || this.text}`);
        return this;
      },
      stop() {
        return this;
      },
    };
  };
})();
|
|
125
|
+
|
|
126
|
+
// -------------------------
|
|
127
|
+
// Utilities & Config
|
|
128
|
+
// -------------------------
|
|
129
|
+
/**
 * Default scraper settings. Frozen so that no code path can alter the shared
 * defaults; a scraper instance works on its own copy made with object spread.
 */
const DEFAULT_CONFIG = Object.freeze({
  BASE_URL: "https://github.com",
  REQUEST_TIMEOUT: 15000, // passed to axios as `timeout`
  MAX_RETRY: 3, // maximum attempts per URL
  SCRAPE_DELAY: 400, // ms between page fetches
  USER_AGENT:
    "Mozilla/5.0 (compatible; h56-github-scrapper/1.0; +https://github.com/)",
  PER_PAGE: 30,
});
|
|
138
|
+
|
|
139
|
+
/**
 * Pause for a while without blocking the event loop.
 *
 * @param {number} ms - Delay handed to setTimeout, in milliseconds.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
|
142
|
+
|
|
143
|
+
/**
 * Convert a counter as rendered on a page ("1,234", "1.2k", "3.4M",
 * "1.2k followers") into an integer.
 *
 * The first number found in the text is used. A `k` or `m` directly after it
 * (optionally separated by whitespace) multiplies it by 1,000 or 1,000,000,
 * but only when that letter is not the start of a word, so "5 members" is 5
 * and not five million. Thousands separators (commas) are ignored.
 *
 * @param {string|number} [text=""] - Raw counter text.
 * @returns {number} The rounded count, or 0 when no number is present.
 */
function parseGithubNumber(text = "") {
  if (!text) return 0;

  const normalized = String(text).toLowerCase().replace(/,/g, "");
  // group 1: the number; group 2: optional k/m suffix not followed by a letter
  const match = normalized.match(/(\d+(?:\.\d+)?|\.\d+)(?:\s*([km])(?![a-z]))?/);
  if (!match) return 0;

  const value = parseFloat(match[1]);
  if (!Number.isFinite(value)) return 0;
  if (match[2] === "k") return Math.round(value * 1000);
  if (match[2] === "m") return Math.round(value * 1000000);
  return Math.round(value);
}
|
|
157
|
+
|
|
158
|
+
/**
 * Render a number with en-US digit grouping (1234567 -> "1,234,567").
 *
 * @param {number} num - Value to format.
 * @returns {string} The grouped representation, or String(num) if
 *   Intl.NumberFormat throws.
 */
function formatNumber(num) {
  let formatted;
  try {
    const formatter = new Intl.NumberFormat("en-US");
    formatted = formatter.format(num);
  } catch (e) {
    formatted = String(num);
  }
  return formatted;
}
|
|
165
|
+
|
|
166
|
+
/**
 * Check that a value has the shape this scraper accepts as a username:
 * 1 to 39 characters, each an ASCII letter, digit or hyphen.
 *
 * Only strings and numbers are considered. Without that guard `undefined` and
 * `null` would be coerced to the strings "undefined" / "null" by the regular
 * expression test and wrongly accepted. Numbers remain accepted because a
 * command-line parser may hand over an all-digit username as a number.
 *
 * @param {string|number} username - Candidate username.
 * @returns {boolean} True when the value is acceptable.
 */
function validateUsername(username) {
  if (typeof username !== "string" && typeof username !== "number") return false;
  return /^[a-zA-Z0-9-]{1,39}$/.test(String(username));
}
|
|
169
|
+
|
|
170
|
+
// -------------------------
|
|
171
|
+
// Translator loader (optional)
|
|
172
|
+
// -------------------------
|
|
173
|
+
// Provide a safe, synchronous attempt to load the local CommonJS wrapper
|
|
174
|
+
// translate-engine/translate.js. If not present, translatorFn will be null.
|
|
175
|
+
// Consumers of translation should handle null translatorFn.
|
|
176
|
+
// Optional translator: stays null unless the wrapper shipped with the package
// loads and exposes a `translate` function.
let translatorFn = null;
try {
  // The local wrapper is preferred; it performs the dynamic require of h56-translator.
  const translateModule = require("./translate-engine/translate");
  const candidate = translateModule ? translateModule.translate : undefined;
  if (typeof candidate === "function") {
    translatorFn = candidate;
  }
} catch (e) {
  // Wrapper missing or failed to load: translations stay disabled.
  translatorFn = null;
}
|
|
187
|
+
|
|
188
|
+
// -------------------------
|
|
189
|
+
// Scraper class
|
|
190
|
+
// -------------------------
|
|
191
|
+
/**
 * Translate `text` and record the outcome on `target`.
 *
 * On success the keys `<prefix>_translated`, `<prefix>_source_lang` and
 * `<prefix>_translation_meta` are set; on failure `<prefix>_translation_error`
 * holds the error message. Translator errors are never rethrown.
 *
 * @param {Function} tfn - Translator, called as tfn(text, lang, { timeoutMs }).
 * @param {object} target - Object that receives the result keys.
 * @param {string} prefix - Key prefix ("bio", "description" or "name").
 * @param {string} text - Text to translate.
 * @param {string} lang - Target language code.
 * @param {number} timeoutMs - Timeout forwarded to the translator.
 * @returns {Promise<void>}
 */
async function translateInto(tfn, target, prefix, text, lang, timeoutMs) {
  try {
    const t = await tfn(text, lang, { timeoutMs });
    if (t && typeof t.translatedText === "string") {
      target[`${prefix}_translated`] = t.translatedText;
      target[`${prefix}_source_lang`] = t.sourceLang || null;
      target[`${prefix}_translation_meta`] = { serviceStatus: t.serviceStatus || "ok" };
    }
  } catch (e) {
    target[`${prefix}_translation_error`] = e && e.message ? e.message : String(e);
  }
}

/**
 * Scrapes public profile and repository data from GitHub HTML pages.
 */
class GithubScraper {
  /**
   * @param {object} [opts={}] - Overrides merged over DEFAULT_CONFIG.
   */
  constructor(opts = {}) {
    this.config = { ...DEFAULT_CONFIG, ...(opts || {}) };
    this.axios = axios.create({
      timeout: this.config.REQUEST_TIMEOUT,
      headers: { "User-Agent": this.config.USER_AGENT, Accept: "text/html" },
    });
  }

  /**
   * GET a URL, retrying with a linear backoff (1s, 2s, ...).
   *
   * Retries happen only for failures that can plausibly succeed later: errors
   * without an HTTP response, and statuses 408, 429 and 5xx. Other statuses
   * (404 in particular) fail immediately.
   *
   * @param {string} url - Address to fetch.
   * @param {number} [attempt=1] - Current attempt number (used internally).
   * @returns {Promise<*>} The response body.
   * @throws {Error} Wrapping the last failure; `cause` is the original error
   *   and `status` is the HTTP status when there was a response.
   */
  async requestWithRetry(url, attempt = 1) {
    try {
      const res = await this.axios.get(url);
      return res.data;
    } catch (err) {
      const status = err && err.response ? err.response.status : undefined;
      const retryable =
        status === undefined || status === 408 || status === 429 || status >= 500;

      if (!retryable || attempt >= this.config.MAX_RETRY) {
        // wrap error with url info
        const e = new Error(`Failed to fetch ${url}: ${err.message}`);
        e.cause = err;
        if (status !== undefined) e.status = status;
        throw e;
      }

      // backoff
      await sleep(1000 * attempt);
      return this.requestWithRetry(url, attempt + 1);
    }
  }

  /**
   * Fetch a page and parse it with cheerio.
   *
   * @param {string} url - Page address.
   * @returns {Promise<Function>} The cheerio root for the page.
   */
  async fetchPage(url) {
    const html = await this.requestWithRetry(url);
    return cheerio.load(html);
  }

  /**
   * Scrape the profile header of a user.
   *
   * @param {string} username - GitHub login.
   * @returns {Promise<object>} username, name, bio, followers, following,
   *   public_repos and profile_url.
   * @throws {Error} With code "NOT_FOUND" when the page does not exist.
   */
  async scrapeProfile(username) {
    const profileUrl = `${this.config.BASE_URL}/${encodeURIComponent(username)}`;

    let $;
    try {
      $ = await this.fetchPage(profileUrl);
    } catch (err) {
      // A 404 rejects inside the HTTP client, so it has to be mapped here;
      // the title check below only covers pages served with a success status.
      if (err && err.status === 404) {
        const e = new Error("Username not found");
        e.code = "NOT_FOUND";
        e.cause = err;
        throw e;
      }
      throw err;
    }

    if ($("title").text().includes("Not Found")) {
      const e = new Error("Username not found");
      e.code = "NOT_FOUND";
      throw e;
    }

    // First non-empty trimmed text among the selectors, tried in order.
    const firstText = (selectors) => {
      for (const selector of selectors) {
        const text = $(selector).first().text().trim();
        if (text) return text;
      }
      return "";
    };

    const name = firstText(['h1[class*="vcard-names"] .p-name', ".p-name.vcard-fullname"]);
    const bio = firstText(['div[class*="p-note"]', 'div[itemprop="description"]']);

    // Pulled from header counters (structure may vary by locale/markup).
    // The element holding only the number is tried first; the whole link text
    // (number plus label) is the fallback.
    const followersText = firstText([
      'a[href$="?tab=followers"] .text-bold',
      'a[href$="?tab=followers"]',
    ]);
    const followingText = firstText([
      'a[href$="?tab=following"] .text-bold',
      'a[href$="?tab=following"]',
    ]);
    const reposText = firstText([
      'a[href$="?tab=repositories"] .Counter',
      'a[href$="?tab=repositories"]',
    ]);

    return {
      username,
      name,
      bio,
      followers: parseGithubNumber(followersText),
      following: parseGithubNumber(followingText),
      public_repos: parseGithubNumber(reposText),
      profile_url: profileUrl,
    };
  }

  /**
   * Scrape every page of a user's repositories tab.
   *
   * Pages are fetched until one has no repository items, waiting SCRAPE_DELAY
   * ms between requests. The loop is capped at `config.MAX_PAGES` pages
   * (1000 when unset) so unexpected markup cannot make it run forever.
   *
   * @param {string} username - GitHub login.
   * @returns {Promise<object[]>} Repositories with name, description, stars,
   *   forks, language and updated_at.
   */
  async scrapeRepos(username) {
    const repos = [];
    const maxPages = this.config.MAX_PAGES > 0 ? this.config.MAX_PAGES : 1000;
    const userPath = encodeURIComponent(username);

    for (let page = 1; page <= maxPages; page++) {
      if (page > 1) await sleep(this.config.SCRAPE_DELAY);

      const url = `${this.config.BASE_URL}/${userPath}?page=${page}&tab=repositories`;
      const $ = await this.fetchPage(url);

      // older layout: li[itemprop='owns'], new layout: #user-repositories-list li
      const legacyItems = $("li[itemprop='owns']");
      const repoItems = legacyItems.length > 0 ? legacyItems : $("#user-repositories-list li");

      if (!repoItems.length) break;

      repoItems.each((_, el) => {
        const el$ = $(el);
        const pick = (...selectors) => {
          for (const selector of selectors) {
            const text = el$.find(selector).text().trim();
            if (text) return text;
          }
          return "";
        };

        repos.push({
          name: pick("a[itemprop='name codeRepository']", "h3 a"),
          description: pick("p[itemprop='description']", "p.col-9"),
          stars: parseGithubNumber(
            pick("a[href$='/stargazers']", "svg[aria-label='star'] + span")
          ),
          forks: parseGithubNumber(
            pick("a[href$='/network/members']", "svg[aria-label='fork'] + span")
          ),
          language:
            pick("[itemprop='programmingLanguage']", ".repo-language-color + span") || "Unknown",
          updated_at: el$.find("relative-time").attr("datetime") || "",
        });
      });
    }

    return repos;
  }

  /**
   * Aggregate repository totals and a language ranking.
   *
   * @param {object[]} repos - Items as produced by scrapeRepos.
   * @returns {{total_repositories: number, total_stars: number,
   *   total_forks: number, top_languages: {language: string, repos: number}[]}}
   */
  calculateStats(repos) {
    // A Map keeps language names such as "constructor" from colliding with
    // properties inherited by a plain object.
    const languageCounts = new Map();
    let totalStars = 0;
    let totalForks = 0;

    for (const r of repos) {
      totalStars += r.stars || 0;
      totalForks += r.forks || 0;
      const lang = r.language || "Unknown";
      languageCounts.set(lang, (languageCounts.get(lang) || 0) + 1);
    }

    const top_languages = Array.from(languageCounts.entries())
      .sort((a, b) => b[1] - a[1])
      .map(([language, count]) => ({ language, repos: count }));

    return {
      total_repositories: repos.length,
      total_stars: totalStars,
      total_forks: totalForks,
      top_languages,
    };
  }

  /**
   * applyTranslations(result, translateOptions)
   *
   * Mutates the result object by adding translated fields.
   *
   * translateOptions (optional) shape:
   * {
   *   lang: string,            // target language code (required to perform translations)
   *   fields?: string[],       // 'bio', 'repo_descriptions', 'repo_names', or
   *                            // 'all_repos' (alias for repo_descriptions + repo_names);
   *                            // default: ['bio', 'repo_descriptions']
   *   perRepoDelay?: number,   // ms delay between repo translations (default 120)
   *   failOnMissing?: boolean  // if true, throw when translator is not available (default false)
   * }
   *
   * A `fields` value that is not an array, or a `perRepoDelay` that is not a
   * non-negative finite number, falls back to the default. This matters when a
   * caller passes the key explicitly as undefined.
   *
   * @returns {Promise<object>} The same `result` object.
   * @throws {Error} With code "TRANSLATOR_MISSING" when failOnMissing is set
   *   and no translator is loaded.
   */
  async applyTranslations(result, translateOptions = {}) {
    if (!translateOptions || !translateOptions.lang) return result;

    const requestedFields = Array.isArray(translateOptions.fields)
      ? translateOptions.fields
      : ["bio", "repo_descriptions"];
    const perRepoDelay =
      Number.isFinite(translateOptions.perRepoDelay) && translateOptions.perRepoDelay >= 0
        ? translateOptions.perRepoDelay
        : 120;
    const failOnMissing = Boolean(translateOptions.failOnMissing);
    const lang = translateOptions.lang;

    // normalize fields
    const fields = new Set();
    for (const f of requestedFields) {
      if (f === "all_repos") {
        fields.add("repo_descriptions");
        fields.add("repo_names");
      } else {
        fields.add(f);
      }
    }

    // translator function must exist
    const tfn = translatorFn;
    if (!tfn) {
      const msg =
        "Optional translator is not available. Install 'h56-translator' (and ensure translate-engine/translate.js is present) to enable translations.";
      if (failOnMissing) {
        const e = new Error(msg);
        e.code = "TRANSLATOR_MISSING";
        throw e;
      }
      // attach a note and skip translations
      result._translation_note = {
        skipped: true,
        reason: msg,
      };
      return result;
    }

    // perform profile translation
    try {
      if (fields.has("bio") && result.profile && result.profile.bio) {
        await translateInto(tfn, result.profile, "bio", result.profile.bio, lang, 5000);
      }
    } catch (e) {
      // defensive: any translator error should not break the main flow
      result._translation_profile_error = e && e.message ? e.message : String(e);
    }

    // perform repository translations sequentially (safer)
    if (Array.isArray(result.repos) && result.repos.length > 0) {
      const lastIndex = result.repos.length - 1;
      for (let i = 0; i <= lastIndex; i++) {
        const repo = result.repos[i];
        try {
          if (fields.has("repo_descriptions") && repo.description) {
            await translateInto(tfn, repo, "description", repo.description, lang, 5000);
          }
          if (fields.has("repo_names") && repo.name) {
            await translateInto(tfn, repo, "name", repo.name, lang, 3000);
          }
        } catch (e) {
          // attach per-repo error but continue
          repo.translation_internal_error = e && e.message ? e.message : String(e);
        }
        // pause between repositories only; nothing follows the last one
        if (i < lastIndex) await sleep(perRepoDelay);
      }
    }

    return result;
  }

  // high-level helper
  /**
   * scrapeUser(username, opts)
   *
   * opts:
   *   spinner: boolean (default true)
   *   translate: {
   *     lang: 'en',                          // target language code (required to enable translations)
   *     fields: ['bio','repo_descriptions'], // which fields to translate
   *     perRepoDelay: 120,                   // ms
   *     failOnMissing: false                 // if true, throw when translator missing
   *   }
   *
   * @returns {Promise<{profile: object, repos: object[], stats: object}>}
   * @throws {Error} With code "INVALID_USERNAME" for a malformed username;
   *   scraping errors are rethrown after the spinner is marked as failed.
   */
  async scrapeUser(username, opts = {}) {
    if (!validateUsername(username)) {
      const e = new Error("Invalid GitHub username format");
      e.code = "INVALID_USERNAME";
      throw e;
    }

    const options = opts || {};
    const spinner =
      options.spinner !== false
        ? ora({ text: `Scraping ${username}...`, spinner: "dots" }).start()
        : null;

    try {
      const profile = await this.scrapeProfile(username);
      if (spinner) spinner.text = "Fetching repositories...";
      const repos = await this.scrapeRepos(username);
      const stats = this.calculateStats(repos);

      let result = { profile, repos, stats };

      // If translation options provided, attempt to apply translations.
      if (options.translate && options.translate.lang) {
        if (spinner) spinner.text = "Applying translations...";
        try {
          result = await this.applyTranslations(result, options.translate);
        } catch (e) {
          // With failOnMissing the error propagates; the outer catch reports
          // the failure on the spinner exactly once.
          if (options.translate.failOnMissing) throw e;
          // attach note and continue
          result._translation_error = e && e.message ? e.message : String(e);
        }
      }

      // Mark success only after every step (translations included) is done.
      if (spinner) spinner.succeed("Scraping completed");
      return result;
    } catch (err) {
      if (spinner) spinner.fail("Failed");
      throw err;
    }
  }

  /**
   * CLI pretty print of a scrape result to stdout.
   *
   * @param {object} profile - Profile as returned by scrapeProfile.
   * @param {object} stats - Statistics as returned by calculateStats.
   * @param {object[]} [repos=[]] - Repositories; the first 10 are listed.
   * @returns {void}
   */
  static printResult(profile, stats, repos = []) {
    console.log("\n========== GITHUB ACCOUNT ==========\n");
    console.log("Username :", profile.username);
    console.log("Name :", profile.name || "-");
    console.log("Bio :", profile.bio || "-");
    if (profile.bio_translated) console.log("Bio (translated):", profile.bio_translated);
    console.log("Followers :", formatNumber(profile.followers));
    console.log("Following :", formatNumber(profile.following));
    console.log("Repos :", formatNumber(profile.public_repos));
    console.log("Profile :", profile.profile_url);

    console.log("\n------- Repository Statistics -------\n");
    console.log("Total Repository :", formatNumber(stats.total_repositories));
    console.log("Total Stars :", formatNumber(stats.total_stars));
    console.log("Total Forks :", formatNumber(stats.total_forks));

    console.log("\nTop Languages:");
    stats.top_languages.forEach((l) => console.log(`• ${l.language} (${l.repos})`));

    if (repos && repos.length) {
      console.log("\nSample repositories:");
      repos.slice(0, 10).forEach((r) =>
        console.log(
          `- ${r.name} (${r.language}) ★${formatNumber(r.stars)} Forks:${formatNumber(
            r.forks
          )}${r.description_translated ? `\n → ${r.description_translated}` : ""}`
        )
      );
    }

    console.log("\n====================================\n");
  }
}
|
|
560
|
+
|
|
561
|
+
// -------------------------
|
|
562
|
+
// Exports (for npm usage)
|
|
563
|
+
// -------------------------
|
|
564
|
+
const defaultScraper = new GithubScraper();
|
|
565
|
+
|
|
566
|
+
// Export translator helper (attempt to load; provide helpful fallback if not available)
|
|
567
|
+
let h56translate;
|
|
568
|
+
try {
|
|
569
|
+
const tmod = require("./translate-engine/translate");
|
|
570
|
+
if (tmod && typeof tmod.translate === "function") {
|
|
571
|
+
h56translate = tmod.translate;
|
|
572
|
+
} else {
|
|
573
|
+
h56translate = async function () {
|
|
574
|
+
throw new Error(
|
|
575
|
+
"Optional translator module loaded but export shape is not recognized. Ensure 'h56-translator' is installed and compatible."
|
|
576
|
+
);
|
|
577
|
+
};
|
|
578
|
+
}
|
|
579
|
+
} catch (e) {
|
|
580
|
+
h56translate = async function () {
|
|
581
|
+
throw new Error(
|
|
582
|
+
"Optional translator is not available. Install it with `npm install h56-translator` or run `npm install` to trigger postinstall."
|
|
583
|
+
);
|
|
584
|
+
};
|
|
585
|
+
}
|
|
586
|
+
|
|
587
|
+
module.exports = {
|
|
588
|
+
GithubScraper,
|
|
589
|
+
defaultScraper,
|
|
590
|
+
scrapeProfile: (username, opts) => defaultScraper.scrapeProfile(username, opts),
|
|
591
|
+
scrapeRepos: (username, opts) => defaultScraper.scrapeRepos(username, opts),
|
|
592
|
+
scrapeUser: (username, opts) => defaultScraper.scrapeUser(username, opts),
|
|
593
|
+
calculateStats: (repos) => defaultScraper.calculateStats(repos),
|
|
594
|
+
printResult: GithubScraper.printResult,
|
|
595
|
+
// new optional helper: h56translate(text, targetLang, options?)
|
|
596
|
+
h56translate,
|
|
597
|
+
};
|
|
598
|
+
|
|
599
|
+
// -------------------------
|
|
600
|
+
// CLI behavior when run directly
|
|
601
|
+
// -------------------------
|
|
602
|
+
// Command-line entry point: parse arguments, scrape one user, then print the
// result and/or write it to a file. Exits with code 0 on success and 1 on error.
if (require.main === module) {
  (async () => {
    const argv = yargs(process.argv.slice(2))
      .usage("Usage: $0 <username> [options]")
      // keep positional arguments as strings so an all-digit or exponent-like
      // username (e.g. "1e5") is not converted to a number
      .string("_")
      .option("json", {
        alias: "j",
        type: "boolean",
        description: "Output raw JSON",
      })
      .option("output", {
        alias: "o",
        type: "string",
        description: "Write JSON output to file",
      })
      .option("no-spinner", {
        type: "boolean",
        description: "Disable spinner output",
      })
      .option("lang", {
        alias: "l",
        type: "string",
        description: "Optional: translate selected text fields to this language (e.g. en, id)",
      })
      .option("translate-fields", {
        type: "string",
        description:
          "Comma-separated fields to translate (bio,repo_descriptions,repo_names,all_repos). Default: bio,repo_descriptions",
      })
      .demandCommand(1, "Github username is required")
      .help().argv;

    const username = String(argv._[0]);

    if (!validateUsername(username)) {
      console.error("Invalid GitHub username format.");
      process.exit(1);
    }

    // build translate options if requested; `fields` is only set when the user
    // supplied some, so the library default applies otherwise
    let translateOpt;
    if (argv.lang) {
      translateOpt = {
        lang: argv.lang,
        perRepoDelay: 120,
        failOnMissing: false,
      };
      const rawFields = argv["translate-fields"];
      if (rawFields) {
        const fields = String(rawFields)
          .split(",")
          .map((s) => s.trim())
          .filter(Boolean);
        if (fields.length) translateOpt.fields = fields;
      }
    }

    // The parser reports `--no-spinner` as `spinner: false` (boolean negation),
    // so both spellings are checked.
    const spinnerEnabled = argv.spinner !== false && !argv["no-spinner"];

    try {
      const result = await defaultScraper.scrapeUser(username, {
        spinner: spinnerEnabled,
        translate: translateOpt,
      });

      const json = JSON.stringify(result, null, 2);

      if (!argv.json) {
        GithubScraper.printResult(result.profile, result.stats, result.repos);
      }

      if (argv.output) {
        // written in both modes when a file is requested
        fs.writeFileSync(path.resolve(argv.output), json + os.EOL, "utf8");
        console.log("Written JSON to", argv.output);
      } else if (argv.json) {
        console.log(json);
      }

      process.exit(0);
    } catch (err) {
      console.error("Error:", err.message || err);
      if (err.cause && err.cause.message) {
        console.error("Cause:", err.cause.message);
      }
      process.exit(1);
    }
  })();
}
|
package/package.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "h56-github-scrapper",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "GitHub user scraper (programmatic + CLI)",
|
|
5
|
+
"main": "main-scrapping.js",
|
|
6
|
+
"bin": {
|
|
7
|
+
"h56-github-scrapper": "main-scrapping.js"
|
|
8
|
+
},
|
|
9
|
+
"keywords": ["github","scraper","scraping"],
|
|
10
|
+
"license": "MIT",
|
|
11
|
+
"dependencies": {
|
|
12
|
+
"axios": "^1.0.0",
|
|
13
|
+
"cheerio": "^1.0.0-rc.12",
|
|
14
|
+
"ora": "^6.0.0",
|
|
15
|
+
"yargs": "^17.0.0"
|
|
16
|
+
},
|
|
17
|
+
"optionalDependencies": {
|
|
18
|
+
"h56-translator": "^1.0.0"
|
|
19
|
+
},
|
|
20
|
+
"scripts": {
|
|
21
|
+
"postinstall": "node ./script/ensure-external-deps.js"
|
|
22
|
+
}
|
|
23
|
+
}
|
package/readme.md
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
# h56-github-scrapper
|
|
2
|
+
|
|
3
|
+
[npm version](https://www.npmjs.com/package/h56-github-scrapper)
|
|
4
|
+
[npm downloads](https://www.npmjs.com/package/h56-github-scrapper)
|
|
5
|
+
[Node.js](https://nodejs.org/)
|
|
6
|
+
[License: MIT](./LICENSE)
|
|
7
|
+
[TypeScript](https://www.typescriptlang.org/)
|
|
8
|
+
[h56-translator](https://www.npmjs.com/package/h56-translator)
|
|
9
|
+
|
|
10
|
+
Ringkasan: h56-github-scrapper adalah paket Node.js ringan untuk mengambil (scrape) informasi profil publik dan repositori pengguna GitHub. Paket ini berfungsi sebagai CLI dan juga dapat diimpor sebagai library programatik. Versi ini menambahkan integrasi opsional dengan layanan terjemahan (`h56-translator`) sehingga Anda dapat memilih teks mana yang ingin diterjemahkan pada output (bio, nama repo, deskripsi repo, dsb).
|
|
11
|
+
|
|
12
|
+
> Peringatan: paket melakukan scraping HTML publik GitHub. Struktur HTML dapat berubah sewaktu-waktu — untuk produksi/skala besar gunakan GitHub REST API (dengan autentikasi). Selalu patuhi Terms of Service GitHub dan etika scraping.
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## Daftar isi
|
|
17
|
+
|
|
18
|
+
- [Fitur utama](#fitur-utama)
|
|
19
|
+
- [Badge & status](#badge--status)
|
|
20
|
+
- [Persyaratan & instalasi](#persyaratan--instalasi)
|
|
21
|
+
- [Quick start — CLI](#quick-start--cli)
|
|
22
|
+
- [Opsi terjemahan (CLI & programatik)](#opsi-terjemahan-cli--programatik)
|
|
23
|
+
- [API Reference singkat — TypeScript interfaces](#api-reference-singkat--typescript-interfaces)
|
|
24
|
+
- [Contoh penggunaan (CommonJS / ESM / TS)](#contoh-penggunaan)
|
|
25
|
+
- [Behavior translator opsional & postinstall](#behavior-translator-opsional--postinstall)
|
|
26
|
+
- [Best practices & etika scraping](#best-practices--etika-scraping)
|
|
27
|
+
- [Troubleshooting](#troubleshooting)
|
|
28
|
+
- [Contributing & changelog singkat](#contributing--changelog-singkat)
|
|
29
|
+
- [License](#license)
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## Fitur utama
|
|
34
|
+
|
|
35
|
+
- Ambil data profil publik: username, nama, bio, followers, following, jumlah repo publik, profile_url.
|
|
36
|
+
- Ambil daftar repositori publik: name, description, language, stars, forks, updated_at.
|
|
37
|
+
- Hitung statistik agregat: total_repositories, total_stars, total_forks, top_languages.
|
|
38
|
+
- CLI interaktif + opsi JSON output.
|
|
39
|
+
- API programatik: `scrapeUser`, `scrapeProfile`, `scrapeRepos`, `calculateStats`, `GithubScraper` class.
|
|
40
|
+
- Integrasi terjemahan opsional via `h56-translator` (wrapper tersedia: `translate-engine/translate.(ts|js)`).
|
|
41
|
+
- Retry/backoff, polite delay antar-request, spinner (ora) untuk UX.
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## Badge & status
|
|
46
|
+
|
|
47
|
+
- npm package: lihat badge versi & download di bagian atas.
|
|
48
|
+
- Node: target minimum Node.js >= 16.
|
|
49
|
+
- License: MIT.
|
|
50
|
+
- Translator: opsi integrasi ditandai sebagai optional; install manual `npm install h56-translator` untuk mengaktifkan fitur terjemahan.
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## Persyaratan & instalasi
|
|
55
|
+
|
|
56
|
+
- Node.js >= 16.x direkomendasikan.
|
|
57
|
+
- Instal dari npm:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
npm install h56-github-scrapper
|
|
61
|
+
# atau
|
|
62
|
+
yarn add h56-github-scrapper
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Jika Anda ingin menggunakan fitur terjemahan, pasang package terjemahan opsional:
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
npm install h56-translator
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Catatan: paket menyediakan skrip `postinstall` yang berusaha memasang `h56-translator` secara otomatis kecuali di environment CI. Untuk memaksa install di CI gunakan:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
H56_FORCE_POSTINSTALL=1 npm install
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Namun untuk determinisme CI/CD sebaiknya deklarasikan dependency secara eksplisit di pipeline Anda.
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## Quick start — CLI
|
|
82
|
+
|
|
83
|
+
Sintaks dasar:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
node main-scrapping.js <username> [--json] [--output=path] [--no-spinner]
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Contoh:
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
# Ringkasan human readable
|
|
93
|
+
node main-scrapping.js HASYIM56
|
|
94
|
+
|
|
95
|
+
# Output JSON ke STDOUT
|
|
96
|
+
node main-scrapping.js HASYIM56 --json
|
|
97
|
+
|
|
98
|
+
# Output JSON ke file
|
|
99
|
+
node main-scrapping.js HASYIM56 --json --output=HASYIM56.json
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Opsi terjemahan CLI (baru)
|
|
103
|
+
|
|
104
|
+
- `--lang, -l <code>` — target bahasa (mis. `en`, `id`, `fr`)
|
|
105
|
+
- `--translate-fields <comma-separated>` — fields yang ingin diterjemahkan: `bio`, `repo_descriptions`, `repo_names`, `all_repos`
|
|
106
|
+
Default: `bio,repo_descriptions`
|
|
107
|
+
- `--no-spinner` — non-aktifkan spinner (berguna pada CI)
|
|
108
|
+
- `--json` — output JSON
|
|
109
|
+
|
|
110
|
+
Contoh:
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
node main-scrapping.js HASYIM56 --lang=en --translate-fields=bio,repo_descriptions --json --output=HASYIM56-en.json
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Jika translator tidak terpasang, CLI tetap berjalan dan akan menambahkan `_translation_note` pada hasil JSON (default fail-safe). Untuk membuat proses gagal ketika translator tidak ada, gunakan opsi programatik `failOnMissing: true` dengan `scrapeUser`.
|
|
117
|
+
|
|
118
|
+
---
|
|
119
|
+
|
|
120
|
+
## Opsi terjemahan (CLI & programatik)
|
|
121
|
+
|
|
122
|
+
Terjemahan bersifat opsional dan dikontrol melalui:
|
|
123
|
+
|
|
124
|
+
- CLI flags (`--lang`, `--translate-fields`)
|
|
125
|
+
- Programatik: `scrapeUser(username, { translate: { lang, fields, perRepoDelay, failOnMissing } })`
|
|
126
|
+
- Helper langsung: `h56translate(text, targetLang, options?)`
|
|
127
|
+
|
|
128
|
+
Default behavior:
|
|
129
|
+
- Jika `translate` tidak diberikan -> tidak ada terjemahan.
|
|
130
|
+
- Jika `translate.lang` diberikan tapi `h56-translator` tidak ada:
|
|
131
|
+
- Default: tetap kembalikan hasil asli dan tambahkan `_translation_note`.
|
|
132
|
+
- Jika `failOnMissing: true` -> lempar error `TRANSLATOR_MISSING`.
|
|
133
|
+
|
|
134
|
+
Rekomendasi: lakukan translate secara sequential untuk mengurangi beban, atau lakukan paralelisasi terbatas + cache untuk skala.
|
|
135
|
+
|
|
136
|
+
---
|
|
137
|
+
|
|
138
|
+
## API Reference singkat — TypeScript interfaces
|
|
139
|
+
|
|
140
|
+
Berikut ringkasan interface yang relevan (copas ke .d.ts atau file dokumentasi Anda):
|
|
141
|
+
|
|
142
|
+
```ts
|
|
143
|
+
// translate-engine/translate.ts (contract)
|
|
144
|
+
export interface TranslationResult {
|
|
145
|
+
translatedText: string;
|
|
146
|
+
sourceLang: string; // kode bahasa terdeteksi (service-defined)
|
|
147
|
+
targetLang: string; // bahasa target yang diminta
|
|
148
|
+
serviceStatus: 'ok' | 'error';
|
|
149
|
+
raw?: any; // payload mentah dari service (opsional)
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
export interface TranslateOptions {
|
|
153
|
+
endpoint?: string; // default jika disediakan oleh service
|
|
154
|
+
signal?: AbortSignal;
|
|
155
|
+
fetch?: typeof globalThis.fetch;
|
|
156
|
+
timeoutMs?: number; // helper timeout, opsional
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
declare function translate(
|
|
160
|
+
text: string,
|
|
161
|
+
targetLang: string,
|
|
162
|
+
options?: TranslateOptions
|
|
163
|
+
): Promise<TranslationResult>;
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
Core scraping types:
|
|
167
|
+
|
|
168
|
+
```ts
|
|
169
|
+
export interface Profile {
|
|
170
|
+
username: string;
|
|
171
|
+
name: string;
|
|
172
|
+
bio: string;
|
|
173
|
+
followers: number;
|
|
174
|
+
following: number;
|
|
175
|
+
public_repos: number;
|
|
176
|
+
profile_url: string;
|
|
177
|
+
// optional translation fields added dynamically:
|
|
178
|
+
bio_translated?: string;
|
|
179
|
+
bio_source_lang?: string | null;
|
|
180
|
+
bio_translation_meta?: any;
|
|
181
|
+
bio_translation_error?: string;
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
export interface Repo {
|
|
185
|
+
name: string;
|
|
186
|
+
description: string;
|
|
187
|
+
language: string;
|
|
188
|
+
stars: number;
|
|
189
|
+
forks: number;
|
|
190
|
+
updated_at?: string;
|
|
191
|
+
// optional translated fields:
|
|
192
|
+
description_translated?: string;
|
|
193
|
+
description_source_lang?: string | null;
|
|
194
|
+
description_translation_meta?: any;
|
|
195
|
+
description_translation_error?: string;
|
|
196
|
+
name_translated?: string;
|
|
197
|
+
name_source_lang?: string | null;
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
export interface Stats {
|
|
201
|
+
total_repositories: number;
|
|
202
|
+
total_stars: number;
|
|
203
|
+
total_forks: number;
|
|
204
|
+
top_languages: { language: string; repos: number }[];
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
export function scrapeUser(
|
|
208
|
+
username: string,
|
|
209
|
+
opts?: {
|
|
210
|
+
spinner?: boolean;
|
|
211
|
+
translate?: {
|
|
212
|
+
lang: string;
|
|
213
|
+
fields?: string[];
|
|
214
|
+
perRepoDelay?: number;
|
|
215
|
+
failOnMissing?: boolean;
|
|
216
|
+
};
|
|
217
|
+
}
|
|
218
|
+
): Promise<{ profile: Profile; repos: Repo[]; stats: Stats; _translation_note?: any }>;
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
---
|
|
222
|
+
|
|
223
|
+
## Contoh penggunaan
|
|
224
|
+
|
|
225
|
+
### CommonJS (Node.js)
|
|
226
|
+
|
|
227
|
+
```js
|
|
228
|
+
const {
|
|
229
|
+
scrapeUser,
|
|
230
|
+
h56translate, // optional helper; may throw if not installed
|
|
231
|
+
} = require("h56-github-scrapper");
|
|
232
|
+
|
|
233
|
+
(async () => {
|
|
234
|
+
// tanpa terjemahan
|
|
235
|
+
const { profile, repos, stats } = await scrapeUser("HASYIM56");
|
|
236
|
+
|
|
237
|
+
// dengan terjemahan via opsi
|
|
238
|
+
const translated = await scrapeUser("HASYIM56", {
|
|
239
|
+
translate: { lang: "en", fields: ["bio", "repo_descriptions"], perRepoDelay: 120 }
|
|
240
|
+
});
|
|
241
|
+
console.log(translated.profile.bio_translated);
|
|
242
|
+
|
|
243
|
+
// menggunakan helper langsung (opsional)
|
|
244
|
+
try {
|
|
245
|
+
const r = await h56translate("Halo dunia", "en");
|
|
246
|
+
console.log(r.translatedText);
|
|
247
|
+
} catch (err) {
|
|
248
|
+
console.warn("Translator unavailable:", err.message);
|
|
249
|
+
}
|
|
250
|
+
})();
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
### ESM (dynamic import)
|
|
254
|
+
|
|
255
|
+
```js
|
|
256
|
+
const pkg = await import("h56-github-scrapper");
|
|
257
|
+
const { scrapeUser, h56translate } = pkg;
|
|
258
|
+
|
|
259
|
+
const res = await scrapeUser("HASYIM56", { translate: { lang: "en" } });
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
### Contoh TypeScript (development)
|
|
263
|
+
|
|
264
|
+
```ts
|
|
265
|
+
import { translate } from "./translate-engine/translate";
|
|
266
|
+
const r = await translate("Halo dunia", "en");
|
|
267
|
+
console.log(r.translatedText);
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
---
|
|
271
|
+
|
|
272
|
+
## Behavior translator opsional & postinstall
|
|
273
|
+
|
|
274
|
+
- `h56-translator` adalah dependency opsional. Paket menyediakan:
|
|
275
|
+
- `translate-engine/translate.ts` (typed wrapper) untuk development/TS.
|
|
276
|
+
- `translate-engine/translate.js` (CJS wrapper) untuk runtime require().
|
|
277
|
+
- `scripts/ensure-external-deps.js` — postinstall helper yang berusaha memasang `h56-translator` jika tidak ada, kecuali di CI (safety).
|
|
278
|
+
- Jika Anda menginginkan pemasangan otomatis di CI:
|
|
279
|
+
- jalankan: `H56_FORCE_POSTINSTALL=1 npm install`
|
|
280
|
+
- Jika translator tidak terpasang:
|
|
281
|
+
- `h56translate(...)` akan melempar Error informatif.
|
|
282
|
+
- `scrapeUser(..., { translate: {...} })` akan:
|
|
283
|
+
- menambahkan `_translation_note` dan melanjutkan (default), atau
|
|
284
|
+
- melempar error jika `failOnMissing: true` disetel.
|
|
285
|
+
|
|
286
|
+
---
|
|
287
|
+
|
|
288
|
+
## Best practices & etika scraping
|
|
289
|
+
|
|
290
|
+
- Jangan paralelisasi scraping untuk banyak akun tanpa jeda; gunakan `SCRAPE_DELAY` dan `MAX_RETRY` yang konservatif.
|
|
291
|
+
- Untuk skala besar/penggunaan produksi, gunakan GitHub API (REST) dengan otentikasi.
|
|
292
|
+
- Untuk terjemahan massal:
|
|
293
|
+
- Perhatikan rate limit dan biaya pada layanan penerjemah.
|
|
294
|
+
- Tambahkan cache (memory/file/db) untuk hasil terjemahan agar tidak berulang.
|
|
295
|
+
- Batasi concurrency ketika melakukan banyak permintaan terjemahan.
|
|
296
|
+
- Gunakan logger terpusat (winston/pino) untuk memantau error, retries, dan metrik.
|
|
297
|
+
|
|
298
|
+
---
|
|
299
|
+
|
|
300
|
+
## Troubleshooting
|
|
301
|
+
|
|
302
|
+
- "Optional dependency 'h56-translator' is not available":
|
|
303
|
+
- Jalankan: `npm install h56-translator`
|
|
304
|
+
- Atau jalankan `npm install` ulang dengan `H56_FORCE_POSTINSTALL=1` jika menggunakan postinstall di CI.
|
|
305
|
+
- Parsing kosong/field hilang:
|
|
306
|
+
- GitHub mungkin mengubah markup; periksa selector di `main-scrapping.js`.
|
|
307
|
+
- Performance / timeout:
|
|
308
|
+
- Atur `REQUEST_TIMEOUT`, `MAX_RETRY`, dan `SCRAPE_DELAY` saat membuat `new GithubScraper({...})`.
|
|
309
|
+
|
|
310
|
+
---
|
|
311
|
+
|
|
312
|
+
## Contributing & changelog singkat
|
|
313
|
+
|
|
314
|
+
Kontribusi disambut. Silakan:
|
|
315
|
+
- Buka issue jika akan mengubah API publik.
|
|
316
|
+
- Sertakan test untuk fitur baru.
|
|
317
|
+
- Ikuti style guide dan sertakan deskripsi perubahan pada PR.
|
|
318
|
+
|
|
319
|
+
Changelog singkat (ringkasan):
|
|
320
|
+
- v1.0.0 — Core scraper + optional translator support (h56-translator) + CLI translate flags.
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// scripts/ensure-external-deps.js
|
|
3
|
+
// Postinstall helper: idempotent installer for optional external deps.
|
|
4
|
+
// - Will attempt to install packages listed in optionalDeps if missing.
|
|
5
|
+
// - Skips automatic install in CI unless H56_FORCE_POSTINSTALL=1
|
|
6
|
+
// - Does not throw on failure to avoid breaking `npm install` completely.
|
|
7
|
+
|
|
8
|
+
const { spawnSync } = require("child_process");
|
|
9
|
+
|
|
10
|
+
const optionalDeps = ["h56-translator"];
|
|
11
|
+
|
|
12
|
+
/**
 * Report whether a package is present, as seen from this script's location.
 *
 * @param {string} name - Package name handed to `require.resolve`.
 * @returns {boolean} `true` when the package resolves, or when resolution
 *   fails only because the package's `exports` map does not expose an entry
 *   point to `require` (the package is on disk in that case); `false` for
 *   every other failure.
 */
function isInstalled(name) {
  try {
    require.resolve(name);
    return true;
  } catch (err) {
    // An exports-restricted (e.g. ESM-only) package is installed even though
    // require.resolve() refuses to return a path for it. Reporting it as
    // missing would re-run `npm install` on every postinstall.
    return Boolean(err) && err.code === "ERR_PACKAGE_PATH_NOT_EXPORTED";
  }
}
|
|
20
|
+
|
|
21
|
+
/**
 * Install the given packages with npm on a best-effort basis.
 *
 * Failures are reported on stderr together with the manual command, but are
 * never thrown, so a failed optional install does not break the surrounding
 * `npm install`.
 *
 * @param {string[]} deps - Package names to install. They are placed on the
 *   npm command line as-is, so they must be trusted values.
 * @returns {void}
 */
function installDeps(deps) {
  if (!deps.length) return;
  const isWindows = process.platform === "win32";
  const npmCmd = isWindows ? "npm.cmd" : "npm";
  const args = ["install", "--no-audit", "--no-fund", "--save", ...deps];
  console.log("Installing optional dependencies:", deps.join(", "));
  // Current Node releases refuse to spawn .cmd/.bat files without a shell
  // (the call fails with EINVAL), so a shell is required on Windows only.
  const res = spawnSync(npmCmd, args, { stdio: "inherit", shell: isWindows });
  if (res.error || res.status !== 0) {
    console.error("Failed to install optional dependencies. You can run manually:");
    console.error("  npm install " + deps.join(" "));
    // do not throw to keep npm install resilient
  } else {
    console.log("Optional dependencies installed.");
  }
}
|
|
35
|
+
|
|
36
|
+
// Postinstall entry point: installs whichever optional dependencies are
// missing, unless running under CI without an explicit override.
(function main() {
  try {
    const missing = [];
    for (const dep of optionalDeps) {
      if (!isInstalled(dep)) {
        missing.push(dep);
      }
    }
    if (missing.length < 1) {
      return;
    }

    // CI runs stay untouched by default; H56_FORCE_POSTINSTALL opts back in.
    const underCi = Boolean(process.env.CI);
    const forced = Boolean(process.env.H56_FORCE_POSTINSTALL);
    if (underCi && !forced) {
      console.log(
        "CI environment detected — skipping automatic installation of optional dependencies.",
        "Set H56_FORCE_POSTINSTALL=1 to force installation in CI."
      );
      return;
    }

    installDeps(missing);
  } catch (err) {
    // Report and carry on: the script must never make `npm install` fail.
    const detail = err && err.message ? err.message : err;
    console.error("Postinstall check encountered an error:", detail);
  }
})();
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
// translate-engine/translate.js
|
|
2
|
+
// CommonJS runtime wrapper for optional dependency 'h56-translator'.
|
|
3
|
+
// Exports: { translate(text, targetLang, options?) }
|
|
4
|
+
|
|
5
|
+
"use strict";
|
|
6
|
+
|
|
7
|
+
function loadTranslator() {
|
|
8
|
+
try {
|
|
9
|
+
const mod = require("h56-translator");
|
|
10
|
+
// handle shapes: named export, default export function, or default object with translate
|
|
11
|
+
if (mod && typeof mod.translate === "function") return { translate: mod.translate };
|
|
12
|
+
if (typeof mod === "function") return { translate: mod };
|
|
13
|
+
if (mod && mod.default && typeof mod.default === "function") return { translate: mod.default };
|
|
14
|
+
if (mod && mod.default && typeof mod.default.translate === "function")
|
|
15
|
+
return { translate: mod.default.translate };
|
|
16
|
+
throw new Error("h56-translator export shape not recognized");
|
|
17
|
+
} catch (err) {
|
|
18
|
+
const e = new Error(
|
|
19
|
+
"Optional dependency 'h56-translator' is not available. Install it with `npm install h56-translator`."
|
|
20
|
+
);
|
|
21
|
+
e.cause = err;
|
|
22
|
+
throw e;
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
/**
 * Translate `text` by delegating to the optional `h56-translator` package.
 *
 * @param {string} text - Text to translate.
 * @param {string} targetLang - Target language code.
 * @param {object} [options] - Passed through unchanged to the underlying translator.
 * @returns {Promise<*>} Whatever the underlying translate function resolves to.
 *   Rejects when the translator cannot be loaded.
 */
async function translate(text, targetLang, options) {
  const engine = loadTranslator();
  const outcome = await engine.translate(text, targetLang, options);
  return outcome;
}
|
|
34
|
+
|
|
35
|
+
module.exports = { translate };
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
// translate-engine/translate.ts
|
|
2
|
+
// TypeScript typed wrapper for optional dependency 'h56-translator'.
|
|
3
|
+
// This file provides a well-typed `translate` function for TypeScript consumers.
|
|
4
|
+
// Usage (TS):
|
|
5
|
+
// import { translate } from "./translate-engine/translate";
|
|
6
|
+
// const r = await translate("Halo dunia", "en");
|
|
7
|
+
|
|
8
|
+
/** Normalized result returned by `translate`. */
export interface TranslationResult {
  /** The translated text. */
  translatedText: string;
  /** Detected source language code (service-defined). */
  sourceLang: string;
  /** The target language that was requested. */
  targetLang: string;
  /** Outcome reported for the translation request. */
  serviceStatus: "ok" | "error";
  /** Raw payload from the underlying translator, when available. */
  raw?: any;
}
|
|
15
|
+
|
|
16
|
+
/**
 * Optional settings accepted by `translate`. This wrapper does not interpret
 * them; they are forwarded as-is to the underlying `h56-translator` function.
 */
export interface TranslateOptions {
  /** Service endpoint override (semantics defined by h56-translator). */
  endpoint?: string;
  /** Abort signal for the request. */
  signal?: AbortSignal;
  /** Custom `fetch` implementation. */
  fetch?: typeof globalThis.fetch;
  /** Timeout in milliseconds. */
  timeoutMs?: number;
}
|
|
22
|
+
|
|
23
|
+
/** Call signature this wrapper expects from the function exported by `h56-translator`. */
type UnderlyingTranslate = (
  text: string,
  targetLang: string,
  options?: TranslateOptions
) => Promise<TranslationResult>;
|
|
28
|
+
|
|
29
|
+
/**
 * Dynamically import the optional `h56-translator` package and locate its
 * translate function.
 *
 * Accepted export shapes, checked in this order: a named `translate` export,
 * a function `default` export, or a `default` object carrying `translate`.
 *
 * @returns An object exposing the located function as `translate`.
 * @throws {Error} When the package cannot be imported or its export shape is
 *   not recognized. The underlying error is attached as `cause`, matching the
 *   CommonJS wrapper in `translate-engine/translate.js`.
 */
async function loadTranslator(): Promise<{ translate: UnderlyingTranslate }> {
  try {
    // dynamic import to support both ESM and CJS consumers
    const mod = await import("h56-translator");
    const anyMod: any = mod;
    if (typeof anyMod.translate === "function") {
      return { translate: anyMod.translate as UnderlyingTranslate };
    }
    if (typeof anyMod.default === "function") {
      return { translate: anyMod.default as UnderlyingTranslate };
    }
    if (anyMod.default && typeof anyMod.default.translate === "function") {
      return { translate: anyMod.default.translate as UnderlyingTranslate };
    }
    throw new Error("h56-translator export shape not recognized");
  } catch (err) {
    const wrapped = new Error(
      "Optional dependency 'h56-translator' is not available. Install it with `npm install h56-translator`."
    );
    // Keep the root failure reachable for callers that report `err.cause`.
    (wrapped as any).cause = err;
    throw wrapped;
  }
}
|
|
50
|
+
|
|
51
|
+
/**
 * Translate `text` into `targetLang` through the optional `h56-translator`
 * package and return a normalized result.
 *
 * @param text - Text to translate.
 * @param targetLang - Requested target language code.
 * @param options - Forwarded unchanged to the underlying translator.
 * @returns The normalized result; fields missing from the payload fall back to
 *   `""` (sourceLang), the requested `targetLang`, `"ok"` (serviceStatus) and
 *   the payload itself (raw).
 * @throws {Error} When the translator cannot be loaded, or when the payload
 *   has no string `translatedText`.
 */
export async function translate(
  text: string,
  targetLang: string,
  options?: TranslateOptions
): Promise<TranslationResult> {
  const engine = await loadTranslator();
  const payload = await engine.translate(text, targetLang, options);

  const hasText = Boolean(payload) && typeof payload.translatedText === "string";
  if (!hasText) {
    throw new Error("Translation service returned unexpected payload");
  }

  const sourceLang = payload.sourceLang || "";
  const resolvedTarget = payload.targetLang || targetLang;
  const serviceStatus = payload.serviceStatus || "ok";
  const rawPayload = payload.raw || payload;

  return {
    translatedText: payload.translatedText,
    sourceLang,
    targetLang: resolvedTarget,
    serviceStatus,
    raw: rawPayload,
  };
}
|