h56-github-scrapper 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,684 @@
1
+ /**
2
+ * main-scrapping.js
3
+ *
4
+ * NPM package style single-file scraper for GitHub user data.
5
+ * Package name suggestion: h56-github-scrapper
6
+ *
7
+ * Features:
8
+ * - Exports programmatic functions: scrapeProfile, scrapeRepos, scrapeUser, calculateStats
9
+ * - CLI entry when run directly: node main-scrapping.js <username> [--json] [--output=file]
10
+ * - Optional translator integration (h56-translator) with selectable fields to translate
11
+ * - Automatic install of missing npm runtime dependencies (asks for consent when needed)
12
+ * - Robust retry/backoff, polite scraping delay, spinner (ora) fallback
13
+ * - Well-structured results and JSON output support
14
+ *
15
+ * NOTE:
16
+ * - The recommended (proper) approach for publishing as an npm package is to list dependencies
17
+ * in package.json. The runtime auto-installer is implemented as a convenience only.
18
+ * - Scraping HTML may break if GitHub changes markup. Consider using GitHub API for production.
19
+ */
20
+
21
+ const fs = require("fs");
22
+ const path = require("path");
23
+ const { spawnSync } = require("child_process");
24
+ const os = require("os");
25
+
26
+ // -------------------------
27
+ // Ensure runtime deps
28
+ // -------------------------
29
/**
 * Read one line from stdin synchronously after printing `question`.
 *
 * Returns the trimmed, lower-cased answer, or `null` when nothing could be
 * read (stdin closed, or the descriptor refuses blocking reads).
 */
function readAnswerSync(question) {
  process.stdout.write(question);
  const bytes = [];
  const chunk = Buffer.alloc(1);
  try {
    // Byte-at-a-time so we never consume input past the end of the line.
    for (;;) {
      const read = fs.readSync(0, chunk, 0, 1, null);
      if (read === 0) break; // EOF
      if (chunk[0] === 0x0a) return Buffer.from(bytes).toString("utf8").trim().toLowerCase();
      bytes.push(chunk[0]);
    }
  } catch (e) {
    // e.g. EAGAIN / EOF errors on some platforms: treated as "no answer" below.
  }
  return bytes.length ? Buffer.from(bytes).toString("utf8").trim().toLowerCase() : null;
}

/**
 * Ensure every package in `deps` can be resolved, installing the missing ones with npm.
 *
 * Behaviour:
 *  - Returns immediately when nothing is missing.
 *  - In CI (process.env.CI set) or when stdin is not a TTY, installs without asking.
 *  - Otherwise asks "Install missing dependencies now? (Y/n)". An empty answer, "y" or
 *    "yes" means consent; if no answer can be read the install proceeds (the historical
 *    default of this script).
 *  - Calls process.exit(1) when consent is refused or `npm install` fails.
 *
 * The previous implementation waited for a readline callback inside a busy loop. The
 * callback can never run while the loop blocks the event loop, so the answer was always
 * ignored after a 10 second CPU spin and the readline interface was left open. The
 * prompt is now read with a blocking fs.readSync on file descriptor 0.
 *
 * @param {string[]} [deps] package names to check (expected to be trusted, hard-coded names)
 * @returns {void}
 */
function ensureDependencies(deps = []) {
  const missing = deps.filter((dep) => {
    try {
      require.resolve(dep);
      return false;
    } catch (e) {
      return true;
    }
  });

  if (!missing.length) return;

  console.log(
    `Dependencies missing: ${missing.join(", ")}. The script can install them automatically.`
  );

  // tty.isatty(0) is used instead of process.stdin.isTTY so that process.stdin is not
  // instantiated before the blocking read below.
  // NOTE(review): touching process.stdin may switch fd 0 to non-blocking mode — confirm.
  const interactive = !process.env.CI && require("tty").isatty(0);

  let consent;
  if (!interactive) {
    consent = true;
    console.log("Non-interactive environment detected, installing automatically...");
  } else {
    const answer = readAnswerSync("Install missing dependencies now? (Y/n): ");
    if (answer === null) {
      consent = true;
      console.log("\nNo answer, defaulting to install.\n");
    } else {
      consent = !answer || answer === "y" || answer === "yes";
    }
  }

  if (!consent) {
    console.error("Cannot proceed without required dependencies. Exiting.");
    process.exit(1);
  }

  console.log(`Installing: ${missing.join(" ")} ...`);
  const isWindows = process.platform === "win32";
  const result = spawnSync(isWindows ? "npm.cmd" : "npm", ["install", "--save", ...missing], {
    stdio: "inherit",
    // .cmd/.bat files cannot be spawned directly on Windows; they need a shell.
    // `missing` is only ever a subset of the caller-supplied package names.
    shell: isWindows,
  });

  if (result.error || result.status !== 0) {
    console.error("Automatic installation failed. Please run:");
    console.error(` npm install ${missing.join(" ")}`);
    process.exit(1);
  }

  console.log("Dependencies installed, continuing...");
}
116
+
117
+ // required runtime modules
118
// Install anything that cannot be resolved before the requires below run.
ensureDependencies(["axios", "cheerio", "ora", "yargs"]);

// Load the runtime modules now that they are expected to be installed.
const axios = require("axios");
const cheerio = require("cheerio");
const yargs = require("yargs");

/**
 * Spinner factory with the subset of the ora interface this file uses
 * (`start()`, `text`, `succeed()`, `fail()`).
 *
 * package.json requests ora ^6, which is published as an ES module. Depending on the
 * Node version, require("ora") then either throws or returns a namespace object whose
 * `default` is the factory. Both cases are handled here; when ora cannot be used at all
 * a minimal non-animated spinner that writes its final status to stderr is returned, so
 * scraping still works.
 */
const ora = (() => {
  try {
    const loaded = require("ora");
    const factory = typeof loaded === "function" ? loaded : loaded && loaded.default;
    if (typeof factory === "function") return factory;
  } catch (e) {
    // Fall through to the plain-text fallback below.
  }
  return function fallbackSpinner(options) {
    const spinner = {
      text: typeof options === "string" ? options : (options && options.text) || "",
      start() {
        return spinner;
      },
      stop() {
        return spinner;
      },
      succeed(message) {
        console.error(`✔ ${message || spinner.text}`);
        return spinner;
      },
      fail(message) {
        console.error(`✖ ${message || spinner.text}`);
        return spinner;
      },
    };
    return spinner;
  };
})();
125
+
126
+ // -------------------------
127
+ // Utilities & Config
128
+ // -------------------------
129
/**
 * Baseline settings merged under any options given to the GithubScraper constructor.
 * Time values are in milliseconds.
 */
const DEFAULT_CONFIG = {
  // Site root that profile and repository-list URLs are built from.
  BASE_URL: "https://github.com",

  // Per-request timeout handed to axios.
  REQUEST_TIMEOUT: 15000,

  // Maximum number of attempts per URL before the error is surfaced.
  MAX_RETRY: 3,

  // Pause between consecutive repository-list page fetches.
  SCRAPE_DELAY: 400,

  // User-Agent header sent with every request.
  USER_AGENT: "Mozilla/5.0 (compatible; h56-github-scrapper/1.0; +https://github.com/)",

  // Not read anywhere in the visible code.
  PER_PAGE: 30,
};
138
+
139
/**
 * Promise-based delay.
 *
 * @param {number} ms how long to wait, in milliseconds
 * @returns {Promise<void>} resolves once the timer fires
 */
function sleep(ms) {
  return new Promise(function (resolve) {
    setTimeout(resolve, ms);
  });
}
142
+
143
/**
 * Convert a GitHub counter label into an integer.
 *
 * Accepts plain numbers ("1200"), thousands separators ("1,200") and the abbreviated
 * "k" / "m" suffixes ("1.2k", "3.4m"). Surrounding label text is tolerated, so
 * "1.2k followers" yields 1200 and "Repositories 42" yields 42. Previously any extra
 * text dropped the suffix ("1.2k followers" became 1).
 *
 * @param {string|number} [text] raw counter text
 * @returns {number} the rounded count, or 0 when no number can be found
 */
function parseGithubNumber(text = "") {
  if (!text) return 0;

  const raw = String(text).toLowerCase().replace(/,/g, "");

  // 1) The whole value is a number with an optional suffix (whitespace ignored).
  // 2) Otherwise take the first numeric token; a k/m suffix only counts when it is not
  //    the first letter of a following word (so "5 members" / "5members" stay 5).
  const match =
    raw.replace(/\s+/g, "").match(/^([\d.]*\d)([km])?$/) ||
    raw.match(/(\d*\.?\d+)([km](?![a-z0-9]))?/);
  if (!match) return 0;

  const value = parseFloat(match[1]);
  if (!Number.isFinite(value)) return 0; // e.g. "..5" — never return NaN

  let multiplier = 1;
  if (match[2] === "k") multiplier = 1000;
  else if (match[2] === "m") multiplier = 1000000;

  return Math.round(value * multiplier);
}
157
+
158
/**
 * Render a number with en-US grouping separators (1234567 -> "1,234,567").
 * If Intl formatting throws, the plain string form of the value is returned.
 *
 * @param {number} num value to format
 * @returns {string}
 */
function formatNumber(num) {
  let rendered;
  try {
    const formatter = new Intl.NumberFormat("en-US");
    rendered = formatter.format(num);
  } catch (err) {
    rendered = String(num);
  }
  return rendered;
}
165
+
166
/**
 * Check that `username` is shaped like a GitHub login: 1-39 characters drawn from
 * ASCII letters, digits and hyphens.
 *
 * Only strings and non-negative safe integers are considered (a CLI parser may hand an
 * all-digit name over as a number). Anything else — notably `undefined` and `null`,
 * which RegExp#test would coerce to the strings "undefined" / "null" — is rejected.
 *
 * Hyphen placement is deliberately not restricted here, matching the original check.
 *
 * @param {string|number} username candidate login
 * @returns {boolean}
 */
function validateUsername(username) {
  const isText = typeof username === "string";
  const isWholeNumber =
    typeof username === "number" && Number.isSafeInteger(username) && username >= 0;
  if (!isText && !isWholeNumber) return false;
  return /^[a-zA-Z0-9-]{1,39}$/.test(String(username));
}
169
+
170
+ // -------------------------
171
+ // Translator loader (optional)
172
+ // -------------------------
173
+ // Provide a safe, synchronous attempt to load the local CommonJS wrapper
174
+ // translate-engine/translate.js. If not present, translatorFn will be null.
175
+ // Consumers of translation should handle null translatorFn.
176
// Optional translator: try to load the local CommonJS wrapper and keep its `translate`
// function. translatorFn stays null when the wrapper is missing, fails to load, or does
// not export a `translate` function; callers must handle the null case.
let translatorFn = null;
try {
  const wrapper = require("./translate-engine/translate");
  const candidate = wrapper ? wrapper.translate : undefined;
  if (typeof candidate === "function") {
    translatorFn = candidate;
  }
} catch (loadError) {
  // Wrapper not available — translations remain disabled.
  translatorFn = null;
}
187
+
188
+ // -------------------------
189
+ // Scraper class
190
+ // -------------------------
191
/**
 * Scrapes public GitHub profile and repository-list pages and derives summary statistics.
 *
 * Construct with optional overrides for any DEFAULT_CONFIG key (BASE_URL, REQUEST_TIMEOUT,
 * MAX_RETRY, SCRAPE_DELAY, USER_AGENT, PER_PAGE). Everything is read from HTML with CSS
 * selectors, so results depend on GitHub's current markup.
 */
class GithubScraper {
  /**
   * @param {object} [opts] overrides merged over DEFAULT_CONFIG
   */
  constructor(opts = {}) {
    this.config = { ...DEFAULT_CONFIG, ...(opts || {}) };
    this.axios = axios.create({
      timeout: this.config.REQUEST_TIMEOUT,
      headers: { "User-Agent": this.config.USER_AGENT, Accept: "text/html" },
    });
  }

  /**
   * GET `url`, retrying failed attempts with a linearly growing delay (1s, 2s, ...).
   *
   * A 404 response is not retried: the page does not exist and waiting will not change
   * that. The thrown error wraps the original one in `cause` and carries the HTTP status
   * (when there was a response) in `status`.
   *
   * @param {string} url
   * @param {number} [attempt] 1-based attempt counter (used by the recursion)
   * @returns {Promise<*>} the response body
   * @throws {Error} after MAX_RETRY failed attempts, or immediately on 404
   */
  async requestWithRetry(url, attempt = 1) {
    try {
      const res = await this.axios.get(url);
      return res.data;
    } catch (err) {
      const status = err && err.response ? err.response.status : undefined;
      if (status === 404 || attempt >= this.config.MAX_RETRY) {
        const e = new Error(`Failed to fetch ${url}: ${err.message}`);
        e.cause = err;
        e.status = status;
        throw e;
      }
      await sleep(1000 * attempt);
      return this.requestWithRetry(url, attempt + 1);
    }
  }

  /**
   * Fetch `url` and parse the body with cheerio.
   *
   * @param {string} url
   * @returns {Promise<Function>} the cheerio root function for the page
   */
  async fetchPage(url) {
    const html = await this.requestWithRetry(url);
    return cheerio.load(html);
  }

  /**
   * Scrape the profile header of `username`.
   *
   * @param {string} username
   * @returns {Promise<{username: string, name: string, bio: string, followers: number,
   *   following: number, public_repos: number, profile_url: string}>}
   * @throws {Error} with code "NOT_FOUND" when the profile page answers 404 or its title
   *   contains "Not Found"
   */
  async scrapeProfile(username) {
    const profileUrl = `${this.config.BASE_URL}/${username}`;

    const notFound = (cause) => {
      const e = new Error("Username not found");
      e.code = "NOT_FOUND";
      if (cause) e.cause = cause;
      return e;
    };

    let $;
    try {
      $ = await this.fetchPage(profileUrl);
    } catch (err) {
      // A missing user is reported as HTTP 404, which surfaces as a rejected request
      // rather than as a page to inspect.
      if (err && err.status === 404) throw notFound(err);
      throw err;
    }

    if ($("title").text().includes("Not Found")) throw notFound();

    // Try each selector in order and return the first non-empty trimmed text.
    const firstText = (...selectors) => {
      for (const selector of selectors) {
        const text = $(selector).first().text().trim();
        if (text) return text;
      }
      return "";
    };

    const name = $('h1[class*="vcard-names"] .p-name').text().trim() ||
      $(".p-name.vcard-fullname").text().trim() ||
      "";

    const bio = $('div[class*="p-note"]').text().trim() ||
      $('div[itemprop="description"]').text().trim() ||
      "";

    // Prefer the dedicated counter element inside each link; the link's own text can
    // also contain the label ("1.2k followers"), which is harder to parse.
    const followersText = firstText(
      'a[href$="?tab=followers"] .text-bold',
      'a[href$="?tab=followers"]'
    );
    const followingText = firstText(
      'a[href$="?tab=following"] .text-bold',
      'a[href$="?tab=following"]'
    );
    const reposText = firstText(
      'a[href$="?tab=repositories"] .Counter',
      'a[href$="?tab=repositories"]'
    );

    return {
      username,
      name,
      bio,
      followers: parseGithubNumber(followersText),
      following: parseGithubNumber(followingText),
      public_repos: parseGithubNumber(reposText),
      profile_url: profileUrl,
    };
  }

  /**
   * Walk the paginated repositories tab of `username` and collect every listed repository.
   *
   * Paging stops at the first page that contains no list items or from which no
   * repository could be extracted (which also prevents looping forever on markup that
   * matches the fallback selector but holds no repositories).
   *
   * @param {string} username
   * @returns {Promise<Array<{name: string, description: string, stars: number,
   *   forks: number, language: string, updated_at: string}>>}
   */
  async scrapeRepos(username) {
    const repos = [];
    let page = 1;
    while (true) {
      const url = `${this.config.BASE_URL}/${username}?page=${page}&tab=repositories`;
      const $ = await this.fetchPage(url);

      // Two list layouts are supported; the itemprop-based one wins when present.
      const owned = $("li[itemprop='owns']");
      const repoItems = owned.length > 0 ? owned : $("#user-repositories-list li");
      if (!repoItems.length) break;

      const countBefore = repos.length;
      repoItems.each((_, el) => {
        const el$ = $(el);
        const repoName =
          el$.find("a[itemprop='name codeRepository']").text().trim() ||
          el$.find("h3 a").text().trim();
        // The fallback selector can match list items that are not repositories.
        if (!repoName) return;

        const starText =
          el$.find("a[href$='/stargazers']").text().trim() ||
          el$.find("svg[aria-label='star'] + span").text().trim();
        const forkText =
          el$.find("a[href$='/network/members']").text().trim() ||
          el$.find("svg[aria-label='fork'] + span").text().trim();

        const language =
          el$.find("[itemprop='programmingLanguage']").text().trim() ||
          el$.find(".repo-language-color + span").text().trim() ||
          "Unknown";

        const description =
          el$.find("p[itemprop='description']").text().trim() ||
          el$.find("p.col-9").text().trim() ||
          "";

        const updated = el$.find("relative-time").attr("datetime") || "";

        repos.push({
          name: repoName,
          description,
          stars: parseGithubNumber(starText),
          forks: parseGithubNumber(forkText),
          language,
          updated_at: updated,
        });
      });

      if (repos.length === countBefore) break;

      page++;
      await sleep(this.config.SCRAPE_DELAY);
    }

    return repos;
  }

  /**
   * Aggregate star/fork totals and a per-language repository count.
   *
   * `top_languages` is sorted by descending count; ties keep first-seen order.
   *
   * @param {Array<{stars?: number, forks?: number, language?: string}>} [repos]
   * @returns {{total_repositories: number, total_stars: number, total_forks: number,
   *   top_languages: Array<{language: string, repos: number}>}}
   */
  calculateStats(repos = []) {
    const list = Array.isArray(repos) ? repos : [];
    // A Map (not a plain object) so language names such as "constructor" cannot collide
    // with inherited Object.prototype properties.
    const perLanguage = new Map();
    let totalStars = 0;
    let totalForks = 0;

    for (const repo of list) {
      totalStars += repo.stars || 0;
      totalForks += repo.forks || 0;
      const lang = repo.language || "Unknown";
      perLanguage.set(lang, (perLanguage.get(lang) || 0) + 1);
    }

    const top_languages = [...perLanguage.entries()]
      .sort((a, b) => b[1] - a[1])
      .map(([language, count]) => ({ language, repos: count }));

    return {
      total_repositories: list.length,
      total_stars: totalStars,
      total_forks: totalForks,
      top_languages,
    };
  }

  /**
   * Translate `text` and store the outcome on `target` under `<prefix>_translated`,
   * `<prefix>_source_lang` and `<prefix>_translation_meta`; on failure store the message
   * under `<prefix>_translation_error` instead. Never rejects because of the translator.
   */
  async _translateInto(target, prefix, text, lang, timeoutMs, tfn) {
    try {
      const t = await tfn(text, lang, { timeoutMs });
      if (t && typeof t.translatedText === "string") {
        target[`${prefix}_translated`] = t.translatedText;
        target[`${prefix}_source_lang`] = t.sourceLang || null;
        target[`${prefix}_translation_meta`] = { serviceStatus: t.serviceStatus || "ok" };
      }
    } catch (e) {
      target[`${prefix}_translation_error`] = e && e.message ? e.message : String(e);
    }
  }

  /**
   * applyTranslations(result, translateOptions)
   *
   * Mutates `result` (as produced by scrapeUser) by adding translated fields, and returns it.
   * Does nothing when `translateOptions.lang` is not set.
   *
   * translateOptions (optional) shape:
   * {
   *   lang: string,            // target language code (required to perform translations)
   *   fields?: string[],       // fields to translate; supported values:
   *                            //   'bio'               (profile.bio)
   *                            //   'repo_descriptions' (repo.description)
   *                            //   'repo_names'        (repo.name)
   *                            //   'all_repos'         (alias for repo_descriptions + repo_names)
   *                            // default: ['bio', 'repo_descriptions']; a single string is
   *                            // accepted, and undefined/null falls back to the default
   *   perRepoDelay?: number,   // ms delay between repo translations (default 120)
   *   failOnMissing?: boolean  // if true, throw when translator is not available (default false)
   * }
   *
   * @throws {Error} with code "TRANSLATOR_MISSING" when the translator is unavailable and
   *   failOnMissing is true
   */
  async applyTranslations(result, translateOptions = {}) {
    if (!translateOptions || !translateOptions.lang) return result;

    // Explicitly-undefined option values (e.g. `fields: undefined`) must not override the
    // defaults — a plain object spread would let them through and break the loop below.
    let requestedFields = ["bio", "repo_descriptions"];
    if (Array.isArray(translateOptions.fields)) requestedFields = translateOptions.fields;
    else if (typeof translateOptions.fields === "string") requestedFields = [translateOptions.fields];

    const opts = {
      ...translateOptions,
      fields: requestedFields,
      perRepoDelay: translateOptions.perRepoDelay != null ? translateOptions.perRepoDelay : 120,
      failOnMissing: Boolean(translateOptions.failOnMissing),
    };

    // Expand the 'all_repos' alias.
    const fields = new Set();
    for (const f of opts.fields) {
      if (f === "all_repos") {
        fields.add("repo_descriptions");
        fields.add("repo_names");
      } else {
        fields.add(f);
      }
    }

    const tfn = translatorFn;
    if (!tfn) {
      const msg =
        "Optional translator is not available. Install 'h56-translator' (and ensure translate-engine/translate.js is present) to enable translations.";
      if (opts.failOnMissing) {
        const e = new Error(msg);
        e.code = "TRANSLATOR_MISSING";
        throw e;
      }
      // Attach a note and skip translations.
      result._translation_note = {
        skipped: true,
        reason: msg,
      };
      return result;
    }

    // Profile bio.
    try {
      if (fields.has("bio") && result.profile && result.profile.bio) {
        await this._translateInto(result.profile, "bio", result.profile.bio, opts.lang, 5000, tfn);
      }
    } catch (e) {
      // Defensive: nothing translation-related may break the main flow.
      result._translation_profile_error = e && e.message ? e.message : String(e);
    }

    // Repositories, one at a time with a pause in between.
    if (Array.isArray(result.repos) && result.repos.length > 0) {
      for (const repo of result.repos) {
        try {
          if (fields.has("repo_descriptions") && repo.description) {
            await this._translateInto(repo, "description", repo.description, opts.lang, 5000, tfn);
          }
          if (fields.has("repo_names") && repo.name) {
            await this._translateInto(repo, "name", repo.name, opts.lang, 3000, tfn);
          }
        } catch (e) {
          // Attach a per-repo error but continue with the next repository.
          repo.translation_internal_error = e && e.message ? e.message : String(e);
        }
        await sleep(opts.perRepoDelay);
      }
    }

    return result;
  }

  /**
   * scrapeUser(username, opts)
   *
   * Scrape profile and repositories, compute statistics and optionally translate.
   *
   * opts:
   *   spinner: boolean (default true)
   *   translate: {
   *     lang: 'en',                          // target language code (required to enable translations)
   *     fields: ['bio','repo_descriptions'], // which fields to translate
   *     perRepoDelay: 120,                   // ms
   *     failOnMissing: false                 // if true, throw when translator missing
   *   }
   *
   * @returns {Promise<{profile: object, repos: object[], stats: object}>} possibly with
   *   `_translation_note` / `_translation_error` attached
   * @throws {Error} code "INVALID_USERNAME" for a malformed username; scraping errors;
   *   translator errors only when translate.failOnMissing is true
   */
  async scrapeUser(username, opts = {}) {
    if (!validateUsername(username)) {
      const e = new Error("Invalid GitHub username format");
      e.code = "INVALID_USERNAME";
      throw e;
    }

    const spinner = (opts.spinner !== false)
      ? ora({ text: `Scraping ${username}...`, spinner: "dots" }).start()
      : null;

    try {
      const profile = await this.scrapeProfile(username);
      if (spinner) spinner.text = "Fetching repositories...";
      const repos = await this.scrapeRepos(username);
      const stats = this.calculateStats(repos);

      let result = { profile, repos, stats };

      if (opts.translate && opts.translate.lang) {
        if (spinner) spinner.text = "Applying translations...";
        try {
          result = await this.applyTranslations(result, opts.translate);
        } catch (e) {
          // Rethrow only when the caller asked for strict behaviour; the outer catch
          // marks the spinner as failed exactly once.
          if (opts.translate.failOnMissing) throw e;
          result._translation_error = e && e.message ? e.message : String(e);
        }
      }

      // Report success only after every step (including translations) has finished, so
      // the spinner is never updated after it has been stopped.
      if (spinner) spinner.succeed("Scraping completed");
      return result;
    } catch (err) {
      if (spinner) spinner.fail("Failed");
      throw err;
    }
  }

  /**
   * Print a human-readable summary to stdout (used by the CLI).
   *
   * @param {object} profile as returned by scrapeProfile
   * @param {object} stats as returned by calculateStats
   * @param {object[]} [repos] repositories; at most the first 10 are listed
   */
  static printResult(profile, stats, repos = []) {
    console.log("\n========== GITHUB ACCOUNT ==========\n");
    console.log("Username :", profile.username);
    console.log("Name :", profile.name || "-");
    console.log("Bio :", profile.bio || "-");
    if (profile.bio_translated) console.log("Bio (translated):", profile.bio_translated);
    console.log("Followers :", formatNumber(profile.followers));
    console.log("Following :", formatNumber(profile.following));
    console.log("Repos :", formatNumber(profile.public_repos));
    console.log("Profile :", profile.profile_url);

    console.log("\n------- Repository Statistics -------\n");
    console.log("Total Repository :", formatNumber(stats.total_repositories));
    console.log("Total Stars :", formatNumber(stats.total_stars));
    console.log("Total Forks :", formatNumber(stats.total_forks));

    console.log("\nTop Languages:");
    stats.top_languages.forEach((l) => console.log(`• ${l.language} (${l.repos})`));

    if (repos && repos.length) {
      console.log("\nSample repositories:");
      repos.slice(0, 10).forEach((r) =>
        console.log(
          `- ${r.name} (${r.language}) ★${formatNumber(r.stars)} Forks:${formatNumber(
            r.forks
          )}${r.description_translated ? `\n → ${r.description_translated}` : ""}`
        )
      );
    }

    console.log("\n====================================\n");
  }
}
560
+
561
+ // -------------------------
562
+ // Exports (for npm usage)
563
+ // -------------------------
564
+ const defaultScraper = new GithubScraper();
565
+
566
+ // Export translator helper (attempt to load; provide helpful fallback if not available)
567
// Public translator helper. It is the wrapper's `translate` function when the optional
// wrapper loads and exports one; otherwise it is an async stub that always rejects with
// an explanatory message.
let h56translate;
{
  // Build an async function that rejects with the given message on every call.
  const rejectingStub = (message) =>
    async function () {
      throw new Error(message);
    };

  try {
    const wrapper = require("./translate-engine/translate");
    const usable = Boolean(wrapper) && typeof wrapper.translate === "function";
    h56translate = usable
      ? wrapper.translate
      : rejectingStub(
          "Optional translator module loaded but export shape is not recognized. Ensure 'h56-translator' is installed and compatible."
        );
  } catch (loadError) {
    h56translate = rejectingStub(
      "Optional translator is not available. Install it with `npm install h56-translator` or run `npm install` to trigger postinstall."
    );
  }
}
586
+
587
+ module.exports = {
588
+ GithubScraper,
589
+ defaultScraper,
590
+ scrapeProfile: (username, opts) => defaultScraper.scrapeProfile(username, opts),
591
+ scrapeRepos: (username, opts) => defaultScraper.scrapeRepos(username, opts),
592
+ scrapeUser: (username, opts) => defaultScraper.scrapeUser(username, opts),
593
+ calculateStats: (repos) => defaultScraper.calculateStats(repos),
594
+ printResult: GithubScraper.printResult,
595
+ // new optional helper: h56translate(text, targetLang, options?)
596
+ h56translate,
597
+ };
598
+
599
+ // -------------------------
600
+ // CLI behavior when run directly
601
+ // -------------------------
602
// CLI entry point: runs only when this file is executed directly, not when required.
//
// Usage: node main-scrapping.js <username> [--json] [--output=file] [--no-spinner]
//                                          [--lang=code] [--translate-fields=a,b]
if (require.main === module) {
  (async () => {
    const argv = yargs(process.argv.slice(2))
      .usage("Usage: $0 <username> [options]")
      .option("json", {
        alias: "j",
        type: "boolean",
        description: "Output raw JSON",
      })
      .option("output", {
        alias: "o",
        type: "string",
        description: "Write JSON output to file",
      })
      .option("no-spinner", {
        type: "boolean",
        description: "Disable spinner output",
      })
      .option("lang", {
        alias: "l",
        type: "string",
        description: "Optional: translate selected text fields to this language (e.g. en, id)",
      })
      .option("translate-fields", {
        type: "string",
        description:
          "Comma-separated fields to translate (bio,repo_descriptions,repo_names,all_repos). Default: bio,repo_descriptions",
      })
      .demandCommand(1, "Github username is required")
      .help().argv;

    // An all-digit positional may arrive as a number; normalise to a string.
    const username = String(argv._[0]);

    if (!validateUsername(username)) {
      console.error("Invalid GitHub username format.");
      process.exit(1);
    }

    // The argument parser may apply boolean negation to "--no-spinner" and report it as
    // `spinner: false` instead of `"no-spinner": true`, so both spellings are honoured.
    const spinnerEnabled = argv.spinner !== false && argv["no-spinner"] !== true;

    // Build translate options only when a target language was requested. `fields` is
    // left out entirely when not given so the library default applies.
    let translateOpt;
    if (argv.lang) {
      translateOpt = { lang: argv.lang, perRepoDelay: 120, failOnMissing: false };
      if (argv["translate-fields"]) {
        const fields = argv["translate-fields"]
          .split(",")
          .map((s) => s.trim())
          .filter(Boolean);
        if (fields.length) translateOpt.fields = fields;
      }
    }

    // Exit only once everything queued on stdout has been handed off; calling
    // process.exit() directly can drop output that is still pending.
    const exitWhenFlushed = (code) => {
      process.exitCode = code;
      process.stdout.write("", () => process.exit(code));
    };

    try {
      const result = await defaultScraper.scrapeUser(username, {
        spinner: spinnerEnabled,
        translate: translateOpt,
      });

      const json = JSON.stringify(result, null, 2);

      if (!argv.json) {
        GithubScraper.printResult(result.profile, result.stats, result.repos);
      }

      if (argv.output) {
        // --output always receives JSON, with or without --json.
        fs.writeFileSync(path.resolve(argv.output), json + os.EOL, "utf8");
        console.log("Written JSON to", argv.output);
      } else if (argv.json) {
        console.log(json);
      }

      exitWhenFlushed(0);
    } catch (err) {
      console.error("Error:", err.message || err);
      if (err.cause && err.cause.message) {
        console.error("Cause:", err.cause.message);
      }
      process.exit(1);
    }
  })();
}
package/package.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "name": "h56-github-scrapper",
3
+ "version": "1.0.0",
4
+ "description": "GitHub user scraper (programmatic + CLI)",
5
+ "main": "main-scrapping.js",
6
+ "bin": {
7
+ "h56-github-scrapper": "main-scrapping.js"
8
+ },
9
+ "keywords": ["github","scraper","scraping"],
10
+ "license": "MIT",
11
+ "dependencies": {
12
+ "axios": "^1.0.0",
13
+ "cheerio": "^1.0.0-rc.12",
14
+ "ora": "^6.0.0",
15
+ "yargs": "^17.0.0"
16
+ },
17
+ "optionalDependencies": {
18
+ "h56-translator": "^1.0.0"
19
+ },
20
+ "scripts": {
21
+ "postinstall": "node ./scripts/ensure-external-deps.js"
22
+ }
23
+ }
package/readme.md ADDED
@@ -0,0 +1,320 @@
1
+ # h56-github-scrapper
2
+
3
+ [![npm version](https://img.shields.io/npm/v/h56-github-scrapper.svg)](https://www.npmjs.com/package/h56-github-scrapper)
4
+ [![Downloads/month](https://img.shields.io/npm/dm/h56-github-scrapper.svg)](https://www.npmjs.com/package/h56-github-scrapper)
5
+ [![Node](https://img.shields.io/badge/node-%3E%3D16-brightgreen.svg?logo=node.js)](https://nodejs.org/)
6
+ [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](./LICENSE)
7
+ [![TypeScript friendly](https://img.shields.io/badge/types-TypeScript-blue.svg?logo=typescript)](https://www.typescriptlang.org/)
8
+ [![Translator: optional](https://img.shields.io/badge/translator-h56--translator-lightgrey.svg?logo=googletranslate)](https://www.npmjs.com/package/h56-translator)
9
+
10
+ Ringkasan: h56-github-scrapper adalah paket Node.js ringan untuk mengambil (scrape) informasi profil publik dan repositori pengguna GitHub. Paket ini berfungsi sebagai CLI dan juga dapat diimpor sebagai library programatik. Versi ini menambahkan integrasi opsional dengan layanan terjemahan (`h56-translator`) sehingga Anda dapat memilih teks mana yang ingin diterjemahkan pada output (bio, nama repo, deskripsi repo, dsb).
11
+
12
+ > Peringatan: paket melakukan scraping HTML publik GitHub. Struktur HTML dapat berubah sewaktu-waktu — untuk produksi/skala besar gunakan GitHub REST API (dengan autentikasi). Selalu patuhi Terms of Service GitHub dan etika scraping.
13
+
14
+ ---
15
+
16
+ ## Daftar isi
17
+
18
+ - [Fitur utama](#fitur-utama)
19
+ - [Badge & status](#badge--status)
20
+ - [Persyaratan & instalasi](#persyaratan--instalasi)
21
+ - [Quick start — CLI](#quick-start--cli)
22
+ - [Opsi terjemahan (CLI & programatik)](#opsi-terjemahan-cli--programatik)
23
+ - [API Reference singkat — TypeScript interfaces](#api-reference-singkat--typescript-interfaces)
24
+ - [Contoh penggunaan (CommonJS / ESM / TS)](#contoh-penggunaan)
25
+ - [Behavior translator opsional & postinstall](#behavior-translator-opsional--postinstall)
26
+ - [Best practices & etika scraping](#best-practices--etika-scraping)
27
+ - [Troubleshooting](#troubleshooting)
28
+ - [Contributing & changelog singkat](#contributing--changelog-singkat)
29
+ - [License](#license)
30
+
31
+ ---
32
+
33
+ ## Fitur utama
34
+
35
+ - Ambil data profil publik: username, nama, bio, followers, following, jumlah repo publik, profile_url.
36
+ - Ambil daftar repositori publik: name, description, language, stars, forks, updated_at.
37
+ - Hitung statistik agregat: total_repositories, total_stars, total_forks, top_languages.
38
+ - CLI interaktif + opsi JSON output.
39
+ - API programatik: `scrapeUser`, `scrapeProfile`, `scrapeRepos`, `calculateStats`, `GithubScraper` class.
40
+ - Integrasi terjemahan opsional via `h56-translator` (wrapper tersedia: `translate-engine/translate.(ts|js)`).
41
+ - Retry/backoff, polite delay antar-request, spinner (ora) untuk UX.
42
+
43
+ ---
44
+
45
+ ## Badge & status
46
+
47
+ - npm package: lihat badge versi & download di bagian atas.
48
+ - Node: target minimum Node.js >= 16.
49
+ - License: MIT.
50
+ - Translator: opsi integrasi ditandai sebagai optional; install manual `npm install h56-translator` untuk mengaktifkan fitur terjemahan.
51
+
52
+ ---
53
+
54
+ ## Persyaratan & instalasi
55
+
56
+ - Node.js >= 16.x direkomendasikan.
57
+ - Instal dari npm:
58
+
59
+ ```bash
60
+ npm install h56-github-scrapper
61
+ # atau
62
+ yarn add h56-github-scrapper
63
+ ```
64
+
65
+ Jika Anda ingin menggunakan fitur terjemahan, pasang package terjemahan opsional:
66
+
67
+ ```bash
68
+ npm install h56-translator
69
+ ```
70
+
71
+ Catatan: paket menyediakan skrip `postinstall` yang berusaha memasang `h56-translator` secara otomatis kecuali di environment CI. Untuk memaksa install di CI gunakan:
72
+
73
+ ```bash
74
+ H56_FORCE_POSTINSTALL=1 npm install
75
+ ```
76
+
77
+ Namun untuk determinisme CI/CD sebaiknya deklarasikan dependency secara eksplisit di pipeline Anda.
78
+
79
+ ---
80
+
81
+ ## Quick start — CLI
82
+
83
+ Sintaks dasar:
84
+
85
+ ```bash
86
+ node main-scrapping.js <username> [--json] [--output=path] [--no-spinner]
87
+ ```
88
+
89
+ Contoh:
90
+
91
+ ```bash
92
+ # Ringkasan human readable
93
+ node main-scrapping.js HASYIM56
94
+
95
+ # Output JSON ke STDOUT
96
+ node main-scrapping.js HASYIM56 --json
97
+
98
+ # Output JSON ke file
99
+ node main-scrapping.js HASYIM56 --json --output=HASYIM56.json
100
+ ```
101
+
102
+ ### Opsi terjemahan CLI (baru)
103
+
104
+ - `--lang, -l <code>` — target bahasa (mis. `en`, `id`, `fr`)
105
+ - `--translate-fields <comma-separated>` — fields yang ingin diterjemahkan: `bio`, `repo_descriptions`, `repo_names`, `all_repos`
106
+ Default: `bio,repo_descriptions`
107
+ - `--no-spinner` — non-aktifkan spinner (berguna pada CI)
108
+ - `--json` — output JSON
109
+
110
+ Contoh:
111
+
112
+ ```bash
113
+ node main-scrapping.js HASYIM56 --lang=en --translate-fields=bio,repo_descriptions --json --output=HASYIM56-en.json
114
+ ```
115
+
116
+ Jika translator tidak terpasang, CLI tetap berjalan dan akan menambahkan `_translation_note` pada hasil JSON (default fail-safe). Untuk membuat proses gagal ketika translator tidak ada, gunakan opsi programatik `failOnMissing: true` dengan `scrapeUser`.
117
+
118
+ ---
119
+
120
+ ## Opsi terjemahan (CLI & programatik)
121
+
122
+ Terjemahan bersifat opsional dan dikontrol melalui:
123
+
124
+ - CLI flags (`--lang`, `--translate-fields`)
125
+ - Programatik: `scrapeUser(username, { translate: { lang, fields, perRepoDelay, failOnMissing } })`
126
+ - Helper langsung: `h56translate(text, targetLang, options?)`
127
+
128
+ Default behavior:
129
+ - Jika `translate` tidak diberikan -> tidak ada terjemahan.
130
+ - Jika `translate.lang` diberikan tapi `h56-translator` tidak ada:
131
+ - Default: tetap kembalikan hasil asli dan tambahkan `_translation_note`.
132
+ - Jika `failOnMissing: true` -> lempar error `TRANSLATOR_MISSING`.
133
+
134
+ Rekomendasi: lakukan translate secara sequential untuk mengurangi beban, atau lakukan paralelisasi terbatas + cache untuk skala.
135
+
136
+ ---
137
+
138
+ ## API Reference singkat — TypeScript interfaces
139
+
140
+ Berikut ringkasan interface yang relevan (copas ke .d.ts atau file dokumentasi Anda):
141
+
142
+ ```ts
143
+ // translate-engine/translate.ts (contract)
144
+ export interface TranslationResult {
145
+ translatedText: string;
146
+ sourceLang: string; // kode bahasa terdeteksi (service-defined)
147
+ targetLang: string; // bahasa target yang diminta
148
+ serviceStatus: 'ok' | 'error';
149
+ raw?: any; // payload mentah dari service (opsional)
150
+ }
151
+
152
+ export interface TranslateOptions {
153
+ endpoint?: string; // default jika disediakan oleh service
154
+ signal?: AbortSignal;
155
+ fetch?: typeof globalThis.fetch;
156
+ timeoutMs?: number; // helper timeout, opsional
157
+ }
158
+
159
+ declare function translate(
160
+ text: string,
161
+ targetLang: string,
162
+ options?: TranslateOptions
163
+ ): Promise<TranslationResult>;
164
+ ```
165
+
166
+ Core scraping types:
167
+
168
+ ```ts
169
+ export interface Profile {
170
+ username: string;
171
+ name: string;
172
+ bio: string;
173
+ followers: number;
174
+ following: number;
175
+ public_repos: number;
176
+ profile_url: string;
177
+ // optional translation fields added dynamically:
178
+ bio_translated?: string;
179
+ bio_source_lang?: string | null;
180
+ bio_translation_meta?: any;
181
+ bio_translation_error?: string;
182
+ }
183
+
184
+ export interface Repo {
185
+ name: string;
186
+ description: string;
187
+ language: string;
188
+ stars: number;
189
+ forks: number;
190
+ updated_at?: string;
191
+ // optional translated fields:
192
+ description_translated?: string;
193
+ description_source_lang?: string | null;
194
+ description_translation_meta?: any;
195
+ description_translation_error?: string;
196
+ name_translated?: string;
197
+ name_source_lang?: string | null;
198
+ }
199
+
200
+ export interface Stats {
201
+ total_repositories: number;
202
+ total_stars: number;
203
+ total_forks: number;
204
+ top_languages: { language: string; repos: number }[];
205
+ }
206
+
207
+ export function scrapeUser(
208
+ username: string,
209
+ opts?: {
210
+ spinner?: boolean;
211
+ translate?: {
212
+ lang: string;
213
+ fields?: string[];
214
+ perRepoDelay?: number;
215
+ failOnMissing?: boolean;
216
+ };
217
+ }
218
+ ): Promise<{ profile: Profile; repos: Repo[]; stats: Stats; _translation_note?: any }>;
219
+ ```
220
+
221
+ ---
222
+
223
+ ## Contoh penggunaan
224
+
225
+ ### CommonJS (Node.js)
226
+
227
+ ```js
228
+ const {
229
+ scrapeUser,
230
+ h56translate, // optional helper; may throw if not installed
231
+ } = require("h56-github-scrapper");
232
+
233
+ (async () => {
234
+ // tanpa terjemahan
235
+ const { profile, repos, stats } = await scrapeUser("HASYIM56");
236
+
237
+ // dengan terjemahan via opsi
238
+ const translated = await scrapeUser("HASYIM56", {
239
+ translate: { lang: "en", fields: ["bio", "repo_descriptions"], perRepoDelay: 120 }
240
+ });
241
+ console.log(translated.profile.bio_translated);
242
+
243
+ // menggunakan helper langsung (opsional)
244
+ try {
245
+ const r = await h56translate("Halo dunia", "en");
246
+ console.log(r.translatedText);
247
+ } catch (err) {
248
+ console.warn("Translator unavailable:", err.message);
249
+ }
250
+ })();
251
+ ```
252
+
253
+ ### ESM (dynamic import)
254
+
255
+ ```js
256
+ const pkg = await import("h56-github-scrapper");
257
+ const { scrapeUser, h56translate } = pkg;
258
+
259
+ const res = await scrapeUser("HASYIM56", { translate: { lang: "en" } });
260
+ ```
261
+
262
+ ### Contoh TypeScript (development)
263
+
264
+ ```ts
265
+ import { translate } from "./translate-engine/translate";
266
+ const r = await translate("Halo dunia", "en");
267
+ console.log(r.translatedText);
268
+ ```
269
+
270
+ ---
271
+
272
+ ## Perilaku translator opsional & postinstall
273
+
274
+ - `h56-translator` adalah dependency opsional. Paket menyediakan:
275
+ - `translate-engine/translate.ts` (typed wrapper) untuk development/TS.
276
+ - `translate-engine/translate.js` (CJS wrapper) untuk runtime require().
277
+ - `scripts/ensure-external-deps.js` — postinstall helper yang berusaha memasang `h56-translator` jika tidak ada, kecuali di CI (safety).
278
+ - Jika Anda menginginkan pemasangan otomatis di CI:
279
+ - jalankan: `H56_FORCE_POSTINSTALL=1 npm install`
280
+ - Jika translator tidak terpasang:
281
+ - `h56translate(...)` akan melempar Error informatif.
282
+ - `scrapeUser(..., { translate: {...} })` akan:
283
+ - menambahkan `_translation_note` dan melanjutkan (default), atau
284
+ - melempar error jika `failOnMissing: true` disetel.
285
+
286
+ ---
287
+
288
+ ## Best practices & etika scraping
289
+
290
+ - Jangan paralelisasi scraping untuk banyak akun tanpa jeda; gunakan `SCRAPE_DELAY` dan `MAX_RETRY` yang konservatif.
291
+ - Untuk skala besar/penggunaan produksi, gunakan GitHub API (REST) dengan otentikasi.
292
+ - Untuk terjemahan massal:
293
+ - Perhatikan rate limit dan biaya pada layanan penerjemah.
294
+ - Tambahkan cache (memory/file/db) untuk hasil terjemahan agar tidak berulang.
295
+ - Batasi concurrency ketika melakukan banyak permintaan terjemahan.
296
+ - Gunakan logger terpusat (winston/pino) untuk memantau error, retries, dan metrik.
297
+
298
+ ---
299
+
300
+ ## Troubleshooting
301
+
302
+ - "Optional dependency 'h56-translator' is not available":
303
+ - Jalankan: `npm install h56-translator`
304
+ - Atau jalankan `npm install` ulang dengan `H56_FORCE_POSTINSTALL=1` jika menggunakan postinstall di CI.
305
+ - Parsing kosong/field hilang:
306
+ - GitHub mungkin mengubah markup; periksa selector di `main-scrapping.js`.
307
+ - Performance / timeout:
308
+ - Atur `REQUEST_TIMEOUT`, `MAX_RETRY`, dan `SCRAPE_DELAY` saat membuat `new GithubScraper({...})`.
309
+
310
+ ---
311
+
312
+ ## Contributing & changelog singkat
313
+
314
+ Kontribusi disambut. Silakan:
315
+ - Buka issue jika akan mengubah API publik.
316
+ - Sertakan test untuk fitur baru.
317
+ - Ikuti style guide dan sertakan deskripsi perubahan pada PR.
318
+
319
+ Changelog singkat (ringkasan):
320
+ - v1.0.0 — Core scraper + optional translator support (h56-translator) + CLI translate flags.
@@ -0,0 +1,55 @@
1
+ #!/usr/bin/env node
2
+ // scripts/ensure-external-deps.js
3
+ // Postinstall helper: idempotent installer for optional external deps.
4
+ // - Will attempt to install packages listed in optionalDeps if missing.
5
+ // - Skips automatic install in CI unless H56_FORCE_POSTINSTALL=1
6
+ // - Does not throw on failure to avoid breaking `npm install` completely.
7
+
8
+ const { spawnSync } = require("child_process");
9
+
10
// Optional external packages this postinstall helper tries to provide.
const optionalDeps = ["h56-translator"];

/**
 * Report whether a package can be resolved from this script's location.
 *
 * @param {string} name - Package name handed to `require.resolve`.
 * @returns {boolean} `true` when resolution succeeds, `false` when it throws.
 */
function isInstalled(name) {
  let resolvable = true;
  try {
    require.resolve(name);
  } catch (_) {
    // Any resolution failure is treated as "not installed".
    resolvable = false;
  }
  return resolvable;
}
20
+
21
/**
 * Install the given packages with `npm install`, reporting failures on stderr
 * instead of throwing so that the surrounding `npm install` is not broken.
 *
 * @param {string[]} deps - Package names to install; an empty list is a no-op.
 * @returns {void}
 */
function installDeps(deps) {
  if (!deps.length) return;

  const isWindows = process.platform === "win32";
  const npmCmd = isWindows ? "npm.cmd" : "npm";
  const args = ["install", "--no-audit", "--no-fund", "--save", ...deps];
  console.log("Installing optional dependencies:", deps.join(", "));

  // On Windows, `.cmd`/`.bat` files cannot be spawned directly: Node.js
  // releases with the CVE-2024-27980 hardening fail with EINVAL unless a shell
  // is used. Run through the shell there, passing one command string so no
  // args array is combined with `shell: true`. `deps` comes from the
  // hard-coded `optionalDeps` list, so no untrusted text reaches the shell.
  const res = isWindows
    ? spawnSync([npmCmd, ...args].join(" "), { stdio: "inherit", shell: true })
    : spawnSync(npmCmd, args, { stdio: "inherit" });

  if (res.error || res.status !== 0) {
    console.error("Failed to install optional dependencies. You can run manually:");
    console.error("  npm install " + deps.join(" "));
    // do not throw to keep npm install resilient
  } else {
    console.log("Optional dependencies installed.");
  }
}
35
+
36
// Postinstall entry point: work out which optional packages are missing and
// install them, unless running in CI without an explicit override. Every
// failure is logged and swallowed so the process never exits non-zero here.
(function main() {
  try {
    const missing = optionalDeps.filter((dep) => !isInstalled(dep));
    if (missing.length === 0) return;

    // Skip auto-install in CI by default to avoid surprises; allow override.
    const runningInCi = Boolean(process.env.CI);
    const forced = Boolean(process.env.H56_FORCE_POSTINSTALL);
    const shouldInstall = !runningInCi || forced;

    if (shouldInstall) {
      installDeps(missing);
      return;
    }

    console.log(
      "CI environment detected — skipping automatic installation of optional dependencies.",
      "Set H56_FORCE_POSTINSTALL=1 to force installation in CI."
    );
  } catch (err) {
    const detail = err && err.message ? err.message : err;
    console.error("Postinstall check encountered an error:", detail);
    // do not exit non-zero; keep postinstall resilient
  }
})();
@@ -0,0 +1,35 @@
1
+ // translate-engine/translate.js
2
+ // CommonJS runtime wrapper for optional dependency 'h56-translator'.
3
+ // Exports: { translate(text, targetLang, options?) }
4
+
5
+ "use strict";
6
+
7
/**
 * Require the optional `h56-translator` package and pick out its translate
 * function, whichever export shape the package uses.
 *
 * Shapes are probed in this order: `mod.translate`, `mod` itself being a
 * function, `mod.default` being a function, then `mod.default.translate`.
 *
 * @returns {{ translate: Function }} Wrapper exposing the resolved function.
 * @throws {Error} A "not available" error for any failure inside the lookup
 *   (failed require or unrecognised shape); the original error is on `cause`.
 */
function loadTranslator() {
  try {
    const lib = require("h56-translator");

    // Lazy probes, evaluated one at a time in priority order.
    const probes = [
      () => lib && lib.translate,
      () => lib,
      () => lib && lib.default,
      () => lib && lib.default && lib.default.translate,
    ];

    for (const probe of probes) {
      const candidate = probe();
      if (typeof candidate === "function") {
        return { translate: candidate };
      }
    }

    throw new Error("h56-translator export shape not recognized");
  } catch (err) {
    const unavailable = new Error(
      "Optional dependency 'h56-translator' is not available. Install it with `npm install h56-translator`."
    );
    unavailable.cause = err;
    throw unavailable;
  }
}
25
+
26
/**
 * Translate `text` into `targetLang` by delegating to h56-translator.
 *
 * The translator is looked up on every call via `loadTranslator()`, so a
 * missing package surfaces as a rejected promise rather than a load-time crash.
 *
 * @param {string} text - Text to translate.
 * @param {string} targetLang - Target language, passed through unchanged.
 * @param {object} [options] - Passed through unchanged to h56-translator.
 * @returns {Promise<*>} Whatever the underlying translate function resolves to.
 */
async function translate(text, targetLang, options) {
  const engine = loadTranslator();
  const outcome = await engine.translate(text, targetLang, options);
  return outcome;
}
34
+
35
+ module.exports = { translate };
@@ -0,0 +1,72 @@
1
+ // translate-engine/translate.ts
2
+ // TypeScript typed wrapper for optional dependency 'h56-translator'.
3
+ // This file provides a well-typed `translate` function for TypeScript consumers.
4
+ // Usage (TS):
5
+ // import { translate } from "./translate-engine/translate";
6
+ // const r = await translate("Halo dunia", "en");
7
+
8
/**
 * Normalized result returned by the typed `translate` wrapper.
 */
export interface TranslationResult {
  /** The translated text. */
  translatedText: string;
  /** Detected source language code (service-defined). */
  sourceLang: string;
  /** The target language that was requested. */
  targetLang: string;
  /** Status reported for the translation service call. */
  serviceStatus: "ok" | "error";
  /** Optional raw payload from the service. */
  raw?: any;
}
15
+
16
/**
 * Optional settings accepted by `translate`. The wrapper forwards this object
 * unchanged to the underlying h56-translator function, so the exact semantics
 * of each field are defined by that package.
 */
export interface TranslateOptions {
  /** Presumably the translation service endpoint URL; confirm against h56-translator. */
  endpoint?: string;
  /** Abort signal; presumably used to cancel the request. */
  signal?: AbortSignal;
  /** Fetch implementation; presumably used in place of the global `fetch`. */
  fetch?: typeof globalThis.fetch;
  /** Timeout in milliseconds, judging by the name; confirm against h56-translator. */
  timeoutMs?: number;
}
22
+
23
/**
 * Call signature this wrapper expects from the translate function exported by
 * h56-translator: text and target language in, a promise of a result out.
 */
type UnderlyingTranslate = (
  text: string,
  targetLang: string,
  options?: TranslateOptions
) => Promise<TranslationResult>;
28
+
29
/**
 * Dynamically import the optional `h56-translator` package and locate its
 * translate function.
 *
 * Export shapes are probed in order: a named `translate` export, a callable
 * default export, then a default export object carrying `translate`.
 *
 * @returns An object exposing the resolved function as `translate`.
 * @throws Error with the "not available" message when the import fails or no
 *   recognised shape is found. The underlying failure is attached as `cause`
 *   (matching the CommonJS wrapper) so the real reason is not lost.
 */
async function loadTranslator(): Promise<{ translate: UnderlyingTranslate }> {
  try {
    // dynamic import to support both ESM and CJS consumers
    const mod = await import("h56-translator");
    const anyMod: any = mod;
    if (typeof anyMod.translate === "function") {
      return { translate: anyMod.translate as UnderlyingTranslate };
    }
    if (typeof anyMod.default === "function") {
      return { translate: anyMod.default as UnderlyingTranslate };
    }
    if (anyMod.default && typeof anyMod.default.translate === "function") {
      return { translate: anyMod.default.translate as UnderlyingTranslate };
    }
    throw new Error("h56-translator export shape not recognized");
  } catch (err) {
    // Assign `cause` as a property rather than via the two-argument Error
    // constructor, which needs ES2022 library typings.
    const unavailable: Error & { cause?: unknown } = new Error(
      "Optional dependency 'h56-translator' is not available. Install it with `npm install h56-translator`."
    );
    unavailable.cause = err;
    throw unavailable;
  }
}
50
+
51
/**
 * Translate `text` into `targetLang` through h56-translator and return a
 * normalized result.
 *
 * Missing or falsy fields in the service payload are defaulted: `sourceLang`
 * to `""`, `targetLang` to the requested language, `serviceStatus` to `"ok"`,
 * and `raw` to the payload itself.
 *
 * @param text - Text to translate.
 * @param targetLang - Requested target language.
 * @param options - Passed through unchanged to the underlying translator.
 * @returns The normalized translation result.
 * @throws Error when the translator cannot be loaded, or when the payload is
 *   falsy or has no string `translatedText`.
 */
export async function translate(
  text: string,
  targetLang: string,
  options?: TranslateOptions
): Promise<TranslationResult> {
  const engine = await loadTranslator();
  const payload = await engine.translate(text, targetLang, options);

  const hasText = Boolean(payload) && typeof payload.translatedText === "string";
  if (!hasText) {
    throw new Error("Translation service returned unexpected payload");
  }

  const {
    translatedText,
    sourceLang,
    targetLang: reportedTarget,
    serviceStatus,
    raw,
  } = payload;

  return {
    translatedText,
    sourceLang: sourceLang || "",
    targetLang: reportedTarget || targetLang,
    serviceStatus: serviceStatus || "ok",
    raw: raw || payload,
  };
}