jobsdb-scraper 1.0.4 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. package/README.md +5 -5
  2. package/dist/{src/cli.js → cli.js} +1 -1
  3. package/package.json +1 -1
  4. /package/dist/{src/NoSandboxPlugin.d.ts → NoSandboxPlugin.d.ts} +0 -0
  5. /package/dist/{src/NoSandboxPlugin.js → NoSandboxPlugin.js} +0 -0
  6. /package/dist/{src/cli.d.ts → cli.d.ts} +0 -0
  7. /package/dist/{src/cloudnode.d.ts → cloudnode.d.ts} +0 -0
  8. /package/dist/{src/cloudnode.js → cloudnode.js} +0 -0
  9. /package/dist/{src/file_io_utils.d.ts → file_io_utils.d.ts} +0 -0
  10. /package/dist/{src/file_io_utils.js → file_io_utils.js} +0 -0
  11. /package/dist/{src/logger.d.ts → logger.d.ts} +0 -0
  12. /package/dist/{src/logger.js → logger.js} +0 -0
  13. /package/dist/{src/parseArguments.d.ts → parseArguments.d.ts} +0 -0
  14. /package/dist/{src/parseArguments.js → parseArguments.js} +0 -0
  15. /package/dist/{src/scrape_jobsdb.d.ts → scrape_jobsdb.d.ts} +0 -0
  16. /package/dist/{src/scrape_jobsdb.js → scrape_jobsdb.js} +0 -0
  17. /package/dist/{src/scrape_operation.d.ts → scrape_operation.d.ts} +0 -0
  18. /package/dist/{src/scrape_operation.js → scrape_operation.js} +0 -0
  19. /package/dist/{src/scrape_utils.d.ts → scrape_utils.d.ts} +0 -0
  20. /package/dist/{src/scrape_utils.js → scrape_utils.js} +0 -0
  21. /package/dist/{src/server.d.ts → server.d.ts} +0 -0
  22. /package/dist/{src/server.js → server.js} +0 -0
  23. /package/dist/{src/types.d.ts → types.d.ts} +0 -0
  24. /package/dist/{src/types.js → types.js} +0 -0
  25. /package/dist/{src/utils.d.ts → utils.d.ts} +0 -0
  26. /package/dist/{src/utils.js → utils.js} +0 -0
package/README.md CHANGED
@@ -25,7 +25,7 @@ A few cool highlights:
 
  - While not strictly required, a residential IP address is highly recommended. Run this from your home for safest guarantees to avoid bot detection. If you must run from outside of home, I recommend using a residential IP proxy.
 
- ### Option 1: Install globally (Reccomended for most users)
+ ### Option 1: Install globally (Recommended for most users)
 
  ```shell script
 
@@ -61,8 +61,8 @@ npm install --save jobsdb-scraper
  ```js
  // Warning: These operations are **NOT** thread-safe.
  import {scrapeJobsdb, findMaxPages} from 'jobsdb-scraper';
- import { ScrapeOptions } from 'jobsdb-scraper/dist/src/types.js';
- import type { ScrapeStats } from 'jobsdb-scraper/dist/src/types.js';
+ import { ScrapeOptions } from 'jobsdb-scraper/dist/types.js';
+ import type { ScrapeStats } from 'jobsdb-scraper/dist/types.js';
  (async () => {
  const scrapeops = new ScrapeOptions(
  //searchResultUrlString (required): The URL of the first page of search results to start scraping from.
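The only change in this snippet is the import path: 1.0.6 flattens `dist/src/*` to `dist/*` (matching the renames in the file list above). A minimal usage sketch against the new paths follows; the `ScrapeOptions` arguments beyond the search URL and the exact call/return shape of `scrapeJobsdb` are assumptions, not confirmed by this diff:

```ts
// Minimal sketch, assuming scrapeJobsdb resolves to { resultPath, scrape_stats }
// (the bundled cli.js uses this shape internally; the public API may differ).
import { scrapeJobsdb } from 'jobsdb-scraper';
import { ScrapeOptions } from 'jobsdb-scraper/dist/types.js';     // 1.0.4: dist/src/types.js
import type { ScrapeStats } from 'jobsdb-scraper/dist/types.js';  // 1.0.4: dist/src/types.js

(async () => {
  const scrapeops = new ScrapeOptions(
    'https://hk.jobsdb.com/jobs' // searchResultUrlString (required); other options omitted
  );
  const { scrape_stats } = await scrapeJobsdb(scrapeops); // hypothetical call shape
  // Field names below come from the bundle's internal stats object; the exported type may differ.
  const stats: ScrapeStats = scrape_stats;
  console.log(`${stats.totalPagesScraped} pages, ${stats.totalJobsScraped} jobs`);
})();
```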
@@ -96,7 +96,7 @@ import type { ScrapeStats } from 'jobsdb-scraper/dist/src/types.js';
  ```
  3. Alternatively you can run the locally installed package with `npx jobsdb-scraper -h`
 
- The name format of the result file is `jobsdb-<region>-<num_pages>-<YY-MM-DD HH:MM:SS>.<format>` and saved to `<path_to_current_working_directory>/jobsdb_scrape_results` by default. UTC time is used for the date. Jobs are not ordered.
+ The name format of the result file is `jobsdb-<region>-<num_pages>-<YY-MM-DD_HH:MM:SS>.<format>` by default and saved to `<path_to_current_working_directory>/jobsdb_scrape_results` by default. The results folder will be created if found not to exist. UTC time is used for the date. Jobs are not ordered.
 
  ## Have additional requirements?
 
@@ -113,4 +113,4 @@ The client program uses the ulixee framework (github.com/ulixee), where each wor
 
  ## License
 
- [PROPRIETARY](PROPRIETARY)
+ [PROPRIETARY](LICENSE)
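For the result-file naming change above, the bundled cli.js (diffed below) builds the default name with Luxon using a four-digit year and the new underscore separator. A rough reconstruction; the helper name and surrounding code are hypothetical, only the template pieces and the format string are taken from the minified source:

```ts
import { DateTime } from 'luxon';
import * as path from 'path';

// Reconstructed default naming from the minified bundle; the helper name is ours.
function defaultResultFileName(searchResultsUrl: URL, numPages: number): string {
  const timestamp = DateTime.utc().toFormat('yyyy-MM-dd_HH:mm:ss'); // underscore separator in 1.0.6
  const region = searchResultsUrl.hostname.substring(0, 2);         // e.g. "hk" or "th"
  return `jobsdb-${region}-${numPages}-${timestamp}`;
}

// Saved under ./jobsdb_scrape_results by default; the folder is created if missing.
const resultPath = path.join(
  'jobsdb_scrape_results',
  `${defaultResultFileName(new URL('https://hk.jobsdb.com/jobs'), 5)}.ndjson`
);
console.log(resultPath);
```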
package/dist/{src/cli.js → cli.js} CHANGED
@@ -1,2 +1,2 @@
  #!/usr/bin/env node
- import{program as e,Argument as t,Option as o,InvalidArgumentError as r}from"commander";import{compile as s}from"html-to-text";import n from"@ulixee/hero-core";import{TransportBridge as a}from"@ulixee/net";import i,{ConnectionToHeroCore as l}from"@ulixee/hero";import{CorePlugin as c}from"@ulixee/hero-plugin-utils";import*as d from"fs";import u,{unlinkSync as h}from"fs";import{pipeline as m}from"stream/promises";import p from"async";import{Transform as f}from"@json2csv/node";import{DateTime as g}from"luxon";import{flatten as w}from"@json2csv/transforms";import*as b from"path";import $,{dirname as y}from"path";import v from"valid-filename";import{v4 as j}from"uuid";import{Sema as S}from"async-sema";import{flatten as C}from"flat";import{spawn as _}from"child_process";import{fileURLToPath as P}from"url";import k from"queue-fifo";import T from"pino";var I=class extends c{static id="NoSandboxPlugin";onNewBrowser(e,t){this.browserEngine.launchArguments.push("--no-sandbox","--disable-setuid-sandbox")}};function x(e,t,o=40){if(e===t)return;const r=Math.min(e/t,1),s=Math.round(r*o),n="█".repeat(s)+"-".repeat(o-s),a=(100*r).toFixed(2);process.stdout.isTTY?(process.stdout.clearLine(0),process.stdout.cursorTo(0),process.stdout.write(`Progress: [${n}] ${a}%`),e>=t&&process.stdout.write("\n")):console.log(`Progress: [${n}] ${a}%`)}function N(e){return new Promise(t=>setTimeout(t,e))}var L=(...e)=>{process.env.IS_LIBRARY_CALL||console.log(...e)};var D=s({});async function E(e,t){await e.goto(t),await e.waitForLoad("DomContentLoaded");const o=e.querySelector('section[data-automation="search-zero-results"], article[data-job-id]');try{await o.$waitForVisible({timeoutMs:1e4})}catch(e){throw new Error(`Fatal Error: Unable to determine whether ${t} is has search results`)}return!o||"SECTION"===await o.tagName}async function R(e,t){let o;L("Finding number of pages available to scrape on %s",e.href);let r=!1;try{if(void 0===t){r=!0;const e=new a,s=new l(e.transportToCore);o=new n({disableSessionPersistence:!0}),o.use(I),o.addConnection(e.transportToClient),t=new i({noChromeSandbox:!0,blockedResourceTypes:["All"],showChrome:!1,connectionToCore:s})}let s=1,c=1e3,d=new URL(e.href);for(;s<=c;){let e=Math.trunc((s+c)/2);d.searchParams.set("page",e.toString()),await E(t,d.href)?c=e-1:s=e+1}return s-1}catch(e){throw e}finally{r&&(await t.close(),await o.close())}}function F(e){if(!e)return;const t=g.fromISO(e);return t.isValid?t.toFormat("yyyy-MM-dd HH:mm:ss"):void 0}function A(e){for(const t in e){const o=e[t];if("string"==typeof o&&o.match(/^\d{4}-\d{2}-\d{2}/))try{e[t]=F(o)}catch(r){console.warn(`Failed to parse date for key "${t}": ${o}, error: ${r}`),e[t]=null}}return e}async function O(e){const t=["hk.jobsdb.com","th.jobsdb.com"];if(!e||"string"!=typeof e||0===e.length)throw new r("URL must be a non-empty string");let o,s=null,c=null;try{if(t.includes(e)&&(e+="/jobs"),e.startsWith("https://")||(e="https://"+e),o=new URL(e),!t.includes(o.hostname))throw new r(`Invalid search url, urls must have either of the following hostnames [${t}] and point to a valid search results page`);const d=new a;c=new n({disableSessionPersistence:!0}),c.addConnection(d.transportToClient),c.use(I),s=new i({noChromeSandbox:!0,blockedResourceTypes:["All"],connectionToCore:new l(d.transportToCore)});if(await E(s,o.href))throw new r(`Invalid search url, urls must have either of the following hostnames [${t}] and point to a valid search results page`);return o}catch(e){throw e}finally{s&&await s.close(),c&&await c.close()}}function 
z(e){if("string"!=typeof e||""===e.trim())throw new r("Save directory path must be a non-empty string");const t=e.trim(),o=$.join(process.cwd(),t);if(d.existsSync(o)){if(!d.statSync(o).isDirectory())throw new r(`Save directory path exists but is not a directory: ${t}`);try{d.accessSync(o,d.constants.W_OK)}catch{throw new r(`Provided directory exists but is not writable: ${t}`)}}else s=o,d.existsSync(s)||d.mkdirSync(s,{recursive:!0});var s;return t}var q=class{cookie;constructor(e={}){this.cookie=e}},H=class{jobIds;url;cookie;constructor(e,t=new q,o=[]){this.url=e,this.jobIds=o,this.cookie=t}},U=class{id;searchResultsUrl;cloudNodePort;pageQueue;timeout;logger;scrape_stats;topLevelFields;outQueue;timeoutPromise;timeoutClear;constructor(e,t,o,r,s,n,a,i,l=3600){this.id=e,this.scrape_stats=i,this.searchResultsUrl=t,this.outQueue=r,this.logger=s,this.pageQueue=n,this.timeout=l,this.topLevelFields=a;const{promise:c,clear:d}=function(e,t){let o;return{promise:new Promise((r,s)=>{o=setTimeout(()=>s(t),1e3*e)}),clear:()=>clearTimeout(o)}}(l,"Timeout");this.timeoutPromise=c.then(()=>s.error(`Scrape op ${e} timed out.`)),this.timeoutClear=d,this.cloudNodePort=o}assemble_cookie(e){return Object.entries(e.cookie).map(([e,t])=>`${e}=${t}`).join("; ")}get_dict(e){const t={};for(let o=0;o<e.length;o++)t[e[o].name]=e[o].value;return t}async scrape_job_details(e,t,o,r,s,n){let a=0;let i=!1,l=s.cookie.cookie;const c={accept:"*/*","accept-language":"en-GB,en-US;q=0.9,en;q=0.8","content-type":"application/json",origin:"https://hk.jobsdb.com",Connection:"keep-alive",Cookie:this.assemble_cookie(s.cookie),priority:"u=1, i",referer:s.url.href,"sec-fetch-dest":"empty","sec-fetch-mode":"cors","sec-fetch-site":"same-origin","seek-request-brand":"jobsdb","seek-request-country":"HK","user-agent":o,"x-seek-ec-sessionid":l.JobseekerSessionId??"","x-seek-ec-visitorid":l.JobseekerVisitorId??"","x-seek-site":"chalice"},d={operationName:"jobDetails",variables:{jobId:r,jobDetailsViewedCorrelationId:j(),sessionId:l.JobseekerSessionId??"",zone:"asia-1",locale:"en-HK",languageCode:"en",countryCode:"HK",timezone:"America/New_York"},query:'query jobDetails($jobId: ID!, $jobDetailsViewedCorrelationId: String!, $sessionId: String!, $zone: Zone!, $locale: Locale!, $languageCode: LanguageCodeIso!, $countryCode: CountryCodeIso2!, $timezone: Timezone!) 
{\n jobDetails(\n id: $jobId\n tracking: {channel: "WEB", jobDetailsViewedCorrelationId: $jobDetailsViewedCorrelationId, sessionId: $sessionId}\n ) {\n job {\n sourceZone\n id\n title\n phoneNumber\n isExpired\n expiresAt {\n dateTimeUtc\n }\n isLinkOut\n contactMatches {\n type\n value\n }\n isVerified\n abstract\n content(platform: WEB)\n status\n listedAt {\n label(context: JOB_POSTED, length: SHORT, timezone: $timezone, locale: $locale)\n dateTimeUtc\n }\n salary {\n currencyLabel(zone: $zone)\n label\n }\n shareLink(platform: WEB, zone: $zone, locale: $locale)\n workTypes {\n label(locale: $locale)\n }\n advertiser {\n id\n name(locale: $locale)\n isVerified\n registrationDate {\n dateTimeUtc\n }\n }\n location {\n label(locale: $locale, type: LONG)\n }\n classifications {\n label(languageCode: $languageCode)\n }\n products {\n branding {\n id\n cover {\n url\n }\n thumbnailCover: cover(isThumbnail: true) {\n url\n }\n logo {\n url\n }\n }\n bullets\n questionnaire {\n questions\n }\n video {\n url\n position\n }\n }\n }\n companyProfile(zone: $zone) {\n id\n name\n companyNameSlug\n shouldDisplayReviews\n branding {\n logo\n }\n overview {\n description {\n paragraphs\n }\n industry\n size {\n description\n }\n website {\n url\n }\n }\n reviewsSummary {\n overallRating {\n numberOfReviews {\n value\n }\n value\n }\n }\n perksAndBenefits {\n title\n }\n }\n companySearchUrl(zone: $zone, languageCode: $languageCode)\n companyTags {\n key(languageCode: $languageCode)\n value\n }\n restrictedApplication(countryCode: $countryCode) {\n label(locale: $locale)\n }\n }\n}'};for(;!i&&a<3;)try{const o=await t.fetch("https://hk.jobsdb.com/graphql",{method:"POST",headers:c,body:JSON.stringify(d)});if(200!==await o.status)throw this.logger.error(`Hero ${this.id}.${e} encountered error status ${await o.status} on job fetch for p${s.url.searchParams.get("page")}`),new Error(`Hero ${this.id}.${e} encountered error status ${await o.status} on job fetch for p${s.url.searchParams.get("page")}`);const r=(await o.json()).data.jobDetails.job;Object.keys(C(r,{delimiter:"_"})).forEach(e=>{this.topLevelFields.has(e)||this.topLevelFields.add(e)}),r.content=D(r.content),this.outQueue.push(r,t=>{if(t)throw this.logger.error(`Hero ${this.id}.${e}'s queue push failed:, ${t.message}`),new Error(`Hero ${this.id}.${e}'s queue push failed:, ${t.message}`)}),i=!0,a++}catch(t){this.logger.error(`Hero ${this.id}.${e} failed to scrape job ${r} on page ${s.url.searchParams.get("page")} on attempt #${a}: ${t.toString()}`)}if(n.release(),3==a)throw this.logger.error(`Hero ${this.id}.${e} failed to scrape job ${r} on page ${s.url.searchParams.get("page")}`),new Error(`Hero ${this.id}.${e} failed to scrape job ${r} on page ${s.url.searchParams.get("page")}`)}async scrape_page_job_details(e,t,o,r,s){let n=[];for(let a of s.jobIds)await o.acquire(),n.push(this.scrape_job_details(e,t,r,a,s,o));await Promise.all(n)}async startWorker(e){const t=new i({noChromeSandbox:!0,blockedResourceTypes:["All"],connectionToCore:{host:`localhost:${this.cloudNodePort}`}});let o=0,r=0;const s=(await t.meta).userAgentString;this.logger.info(`Hero instance ${this.id}.${e} started`);const n=new S(8);try{for(;!this.pageQueue.isEmpty();){let a=[];const i=this.pageQueue.dequeue();this.searchResultsUrl.searchParams.set("page",String(i)),this.logger.info(`Hero ${this.id}.${e} dequeued page ${i}`),await t.goto(this.searchResultsUrl.href),await t.waitForLoad("DomContentLoaded");let l=t.querySelectorAll("article[data-job-id]"),c=await l.length;for(let 
e=0;e<c;e++){const t=l.item(e);await t.$waitForVisible({timeoutMs:1e4});const o=await t.getAttribute("data-job-id");a.push(o)}if(0===a.length)break;const d=new q(this.get_dict(await t.activeTab.cookieStorage.getItems())),u=new H(this.searchResultsUrl,d,a);await this.scrape_page_job_details(e,t,n,s,u),o++,r+=a.length,this.scrape_stats.totalPagesScraped++,this.scrape_stats.totalJobsScraped+=a.length,this.logger.info(`Hero ${this.id}.${e} successfully scraped page ${i} with ${a.length} jobs`)}this.logger.info(`Hero instance ${this.id}.${e} closed, scraped ${o} pages and ${r} jobs.`)}catch(o){throw this.logger.info(`Hero ${this.id}.${e} failed on ${await t.activeTab.url}`),o}finally{await t.close()}}async scrape_all_jobs(){const e=[];let t=Math.min(this.pageQueue.size(),10);for(let o=0;o<t;o++)e.push(this.startWorker(o));await Promise.all(e)}async __start__(){try{this.logger.info(`Starting scrape operation ${this.id}, using cloud node on port ${this.cloudNodePort}`);if("Timeout"===await Promise.race([this.scrape_all_jobs(),this.timeoutPromise]))throw new Error(`Scrape operation ${this.id} timed out after ${this.timeout} seconds.`)}catch(e){throw this.logger.error(`Scrape operation ${this.id} failed: ${e.message}`),e}finally{this.timeoutClear()}}},M="./jobsdb_scrape_logs";function W(e){return new Promise((t,o)=>{e.stdout?.once("data",e=>{try{const o=parseInt(e.toString());t(o)}catch(e){o(e)}})})}var V=y(P(import.meta.url)),B=y(V),Q="true"===process.env.LOG_ENABLED;Q?function(e){if(d.existsSync(e))try{d.rmSync(e,{recursive:!0,force:!0})}catch(e){console.error("Error while removing logs directory:",e)}}(b.join(B,"jobsdb_scrape_logs")):process.removeAllListeners("warning");var J,K,Y={totalJobsScraped:0,totalPagesScraped:0},G=function(e,t){if(!t)return T({level:"silent"});u.mkdirSync(M,{recursive:!0});const o=(new Date).toISOString().replace(/[:.]/g,"-"),r=`${M}/${e}-${o}.log`,s=T.destination({dest:r,sync:!1});return T({level:"info",name:e},s)}("client",Q),Z=new Set,X=[],ee=0,te=new k,oe=[],re=[],se=[],ne=Date.now()/1e3;async function ae(e){const t=e.maxPages,o=e.searchResultsUrl,r=e.numPages;if(!e.resultFileName){const t=g.utc().toFormat("yyyy-MM-dd_HH:mm:ss");e.resultFileName=`jobsdb-${o.hostname.substring(0,2)}-${r}-${t}`}const s=`${e.resultFileName}.ndjson`;let n=b.join(e.saveDir,s);for(let e=1;e<=r;e++)te.enqueue(e);var a,i;a=n,J=d.createWriteStream(a,{flags:"a",flush:!0,highWaterMark:67108864}),i=J,K=p.queue((e,t)=>{try{const o=JSON.stringify(e)+"\n";i.write(o)?t():i.once("drain",()=>{i.write(o)?t():t(new Error("Write failed after drain"))})}catch(e){t(e)}},1),J.on("error",e=>{throw G.error(`WriteStream error: ${e.message}`),new Error(`WriteStream error: ${e.message}`)}),ee=r>10?2:1;try{for(let e=0;e<ee;e++){const t=_("node",[b.join(B,"src/cloudnode.js"),String(e)],{shell:!1});G.info(`Starting cloudnode ${e}...`),X.push(t)}for(let e=0;e<ee;e++)se.push(await W(X[e])),G.info(`Cloudnode ${e} started on port ${se[e]}`);for(let e=0;e<ee;e++)oe.push(new U(e,o,se[e],K,G.child({module:`scrapeOp${e}`}),te,Z,Y)),re.push(oe[e].__start__()),G.info(`Scrape operation ${e} initialized`);let s=!1;L(`Scraping ${r}/${t} available pages of jobs on ${o.href}.`),G.info(`Scraping ${r}/${t} available pages of jobs on ${o.href}.`);const a=Promise.all(re).finally(()=>{s=!0,te.isEmpty()||console.error("\nCouldn't complete scraping operation at this time, try again in ~1min, if still persists, please file an issue on github")});for(process.env.IS_LIBRARY_CALL&&await a;!s;)x(r-te.size(),r),await N(1e3);return 
L(),G.info("All scrape operations completed."),await async function(e){return new Promise(t=>{0===e.length()&&0===e.running()?t():e.drain(t)})}(K),G.info("Object queue drained."),await async function(e){return new Promise(t=>{e.once("finish",t),e.end()})}(J),G.info("Outstream closed."),"csv"===e.format&&(await async function(e,t){const o=e.replace(".ndjson",".csv"),r=d.createReadStream(e,{encoding:"utf-8"}),s=d.createWriteStream(o,{encoding:"utf-8"}),n={ndjson:!0,defaultValue:null,transforms:[w({objects:!0,arrays:!0,separator:"_"}),A],fields:[...t]},a=new f(n,{},{});await m(r,a,s)}(n,Z),h(n),n=n.replace(".ndjson",".csv")),L(`Scrape finished in ${Math.floor(Date.now()/1e3-ne)} seconds`),L(`Result file saved to ./${n}`),G.info(`Result file saved to ./${n}`),G.info(`Scrape finished in ${Math.floor(Date.now()/1e3-ne)} seconds`),{resultPath:n,scrape_stats:Y}}catch(e){"EACCES"===e.code?(console.error("The specified result directory does not have write permissions."),G.error("The specified result directory does not have write permissions.")):(console.error("scrape_jobsdb.ts in main:",e),G.error(`Error during scraping: ${e.message}`))}finally{for(let e=0;e<ee;e++)X.length>0&&(G.info(`Shutting down CloudNode ${e} on port ${se[e]}...`),!1===X[e].kill()&&(console.error("Error during CloudNode shutdown"),G.error(`Error during CloudNode ${e} shutdown`)))}}e.command("maxPages").description("Find maximum number of pages available to scrape for a given search results url").addArgument(new t("[searchResultsUrl]","The job listing results url (e.g. https://hk.jobsdb.com/jobs)")).action(async e=>{try{const t=await O(e),o=await R(t);console.log(`Found ${o} pages available to scrape on ${e}`)}catch(e){console.error(e.message)}}),e.command("scrape",{isDefault:!0}).description("Scrape job listings").addArgument(new t("[searchResultsUrl]","The job listing results url (e.g. https://hk.jobsdb.com/jobs)")).addOption(new o("-n, --numPages <number>","Number of pages to scrape").default("all")).addOption(new o("--fn, --resultFileName <saveFileName>","Name of the results save file (without extension)")).addOption(new o("-f, --format <file_format>","File format to use, csv files fields are completely flattened (including arrays), and date/time fields are normalized to SQL DateTime.").default("ndjson").choices(["ndjson","csv"])).addOption(new o("-s, --saveDir <pathToDir>","Relative path directory from where the program is run to store results file (optional)").default("./jobsdb_scrape_results")).action(async(e,t)=>{try{const o=z(t.saveDir);t.resultFileName&&function(e,t){if(!v(`${e}.${t}`))throw new r(`Provided filename "${e}" is invalid`)}(t.resultFileName,t.format);const s=await O(e),[n,a]=await async function(e,t){const o=await R(t);if(-1==o)throw new Error("\nCouldn't find the pages available to scrape, please file an issue on github");if("all"==e)return[o,o];const s=parseInt(e);if(isNaN(s))throw new r("Provided numPages arg is not a number.");if(s<1)throw new r("Provided numPages arg must be >=1");if(o<s)throw new r(`Provided numPages args must be <= ${o}`);return[s,o]}(t.numPages,s);t.numPages=n,t.maxPages=a,t.saveDir=o,t.searchResultsUrl=s}catch(e){return void console.error(e.message)}await ae(t)}),(async()=>{await e.parseAsync(process.argv)})();
+ import{program as e,Argument as t,Option as o,InvalidArgumentError as r}from"commander";import{compile as s}from"html-to-text";import n from"@ulixee/hero-core";import{TransportBridge as a}from"@ulixee/net";import i,{ConnectionToHeroCore as l}from"@ulixee/hero";import{CorePlugin as c}from"@ulixee/hero-plugin-utils";import*as u from"fs";import d,{unlinkSync as h}from"fs";import{pipeline as m}from"stream/promises";import p from"async";import{Transform as f}from"@json2csv/node";import{DateTime as g}from"luxon";import{flatten as w}from"@json2csv/transforms";import*as b from"path";import $,{dirname as y}from"path";import v from"valid-filename";import{v4 as j}from"uuid";import{Sema as S}from"async-sema";import{flatten as C}from"flat";import{spawn as _}from"child_process";import{fileURLToPath as P}from"url";import k from"queue-fifo";import T from"pino";var I=class extends c{static id="NoSandboxPlugin";onNewBrowser(e,t){this.browserEngine.launchArguments.push("--no-sandbox","--disable-setuid-sandbox")}};function x(e,t,o=40){if(e===t)return;const r=Math.min(e/t,1),s=Math.round(r*o),n="█".repeat(s)+"-".repeat(o-s),a=(100*r).toFixed(2);process.stdout.isTTY?(process.stdout.clearLine(0),process.stdout.cursorTo(0),process.stdout.write(`Progress: [${n}] ${a}%`),e>=t&&process.stdout.write("\n")):console.log(`Progress: [${n}] ${a}%`)}function N(e){return new Promise(t=>setTimeout(t,e))}var L=(...e)=>{process.env.IS_LIBRARY_CALL||console.log(...e)};var D=s({});async function E(e,t){await e.goto(t),await e.waitForLoad("DomContentLoaded");const o=e.querySelector('section[data-automation="search-zero-results"], article[data-job-id]');try{await o.$waitForVisible({timeoutMs:1e4})}catch(e){throw new Error(`Fatal Error: Unable to determine whether ${t} is has search results`)}return!o||"SECTION"===await o.tagName}async function R(e,t){let o;L("Finding number of pages available to scrape on %s",e.href);let r=!1;try{if(void 0===t){r=!0;const e=new a,s=new l(e.transportToCore);o=new n({disableSessionPersistence:!0}),o.use(I),o.addConnection(e.transportToClient),t=new i({noChromeSandbox:!0,blockedResourceTypes:["All"],showChrome:!1,connectionToCore:s})}let s=1,c=1e3,u=new URL(e.href);for(;s<=c;){let e=Math.trunc((s+c)/2);u.searchParams.set("page",e.toString()),await E(t,u.href)?c=e-1:s=e+1}return s-1}catch(e){throw e}finally{r&&(await t.close(),await o.close())}}function F(e){if(!e)return;const t=g.fromISO(e);return t.isValid?t.toFormat("yyyy-MM-dd HH:mm:ss"):void 0}function O(e){for(const t in e){const o=e[t];if("string"==typeof o&&o.match(/^\d{4}-\d{2}-\d{2}/))try{e[t]=F(o)}catch(r){console.warn(`Failed to parse date for key "${t}": ${o}, error: ${r}`),e[t]=null}}return e}async function A(e){const t=["hk.jobsdb.com","th.jobsdb.com"];if(!e||"string"!=typeof e||0===e.length)throw new r("URL must be a non-empty string");let o,s=null,c=null;try{if(t.includes(e)&&(e+="/jobs"),e.startsWith("https://")||(e="https://"+e),o=new URL(e),!t.includes(o.hostname))throw new r(`Invalid search url, urls must have either of the following hostnames [${t}] and point to a valid search results page`);const u=new a;c=new n({disableSessionPersistence:!0}),c.addConnection(u.transportToClient),c.use(I),s=new i({noChromeSandbox:!0,blockedResourceTypes:["All"],connectionToCore:new l(u.transportToCore)});if(await E(s,o.href))throw new r(`Invalid search url, urls must have either of the following hostnames [${t}] and point to a valid search results page`);return o}catch(e){throw e}finally{s&&await s.close(),c&&await c.close()}}function 
z(e){if("string"!=typeof e||""===e.trim())throw new r("Save directory path must be a non-empty string");const t=e.trim(),o=$.join(process.cwd(),t);if(u.existsSync(o)){if(!u.statSync(o).isDirectory())throw new r(`Save directory path exists but is not a directory: ${t}`);try{u.accessSync(o,u.constants.W_OK)}catch{throw new r(`Provided directory exists but is not writable: ${t}`)}}else s=o,u.existsSync(s)||u.mkdirSync(s,{recursive:!0});var s;return t}var q=class{cookie;constructor(e={}){this.cookie=e}},H=class{jobIds;url;cookie;constructor(e,t=new q,o=[]){this.url=e,this.jobIds=o,this.cookie=t}},U=class{id;searchResultsUrl;cloudNodePort;pageQueue;timeout;logger;scrape_stats;topLevelFields;outQueue;timeoutPromise;timeoutClear;constructor(e,t,o,r,s,n,a,i,l=3600){this.id=e,this.scrape_stats=i,this.searchResultsUrl=t,this.outQueue=r,this.logger=s,this.pageQueue=n,this.timeout=l,this.topLevelFields=a;const{promise:c,clear:u}=function(e,t){let o;return{promise:new Promise((r,s)=>{o=setTimeout(()=>s(t),1e3*e)}),clear:()=>clearTimeout(o)}}(l,"Timeout");this.timeoutPromise=c.then(()=>s.error(`Scrape op ${e} timed out.`)),this.timeoutClear=u,this.cloudNodePort=o}assemble_cookie(e){return Object.entries(e.cookie).map(([e,t])=>`${e}=${t}`).join("; ")}get_dict(e){const t={};for(let o=0;o<e.length;o++)t[e[o].name]=e[o].value;return t}async scrape_job_details(e,t,o,r,s,n){let a=0;let i=!1,l=s.cookie.cookie;const c={accept:"*/*","accept-language":"en-GB,en-US;q=0.9,en;q=0.8","content-type":"application/json",origin:"https://hk.jobsdb.com",Connection:"keep-alive",Cookie:this.assemble_cookie(s.cookie),priority:"u=1, i",referer:s.url.href,"sec-fetch-dest":"empty","sec-fetch-mode":"cors","sec-fetch-site":"same-origin","seek-request-brand":"jobsdb","seek-request-country":"HK","user-agent":o,"x-seek-ec-sessionid":l.JobseekerSessionId??"","x-seek-ec-visitorid":l.JobseekerVisitorId??"","x-seek-site":"chalice"},u={operationName:"jobDetails",variables:{jobId:r,jobDetailsViewedCorrelationId:j(),sessionId:l.JobseekerSessionId??"",zone:"asia-1",locale:"en-HK",languageCode:"en",countryCode:"HK",timezone:"America/New_York"},query:'query jobDetails($jobId: ID!, $jobDetailsViewedCorrelationId: String!, $sessionId: String!, $zone: Zone!, $locale: Locale!, $languageCode: LanguageCodeIso!, $countryCode: CountryCodeIso2!, $timezone: Timezone!) 
{\n jobDetails(\n id: $jobId\n tracking: {channel: "WEB", jobDetailsViewedCorrelationId: $jobDetailsViewedCorrelationId, sessionId: $sessionId}\n ) {\n job {\n sourceZone\n id\n title\n phoneNumber\n isExpired\n expiresAt {\n dateTimeUtc\n }\n isLinkOut\n contactMatches {\n type\n value\n }\n isVerified\n abstract\n content(platform: WEB)\n status\n listedAt {\n label(context: JOB_POSTED, length: SHORT, timezone: $timezone, locale: $locale)\n dateTimeUtc\n }\n salary {\n currencyLabel(zone: $zone)\n label\n }\n shareLink(platform: WEB, zone: $zone, locale: $locale)\n workTypes {\n label(locale: $locale)\n }\n advertiser {\n id\n name(locale: $locale)\n isVerified\n registrationDate {\n dateTimeUtc\n }\n }\n location {\n label(locale: $locale, type: LONG)\n }\n classifications {\n label(languageCode: $languageCode)\n }\n products {\n branding {\n id\n cover {\n url\n }\n thumbnailCover: cover(isThumbnail: true) {\n url\n }\n logo {\n url\n }\n }\n bullets\n questionnaire {\n questions\n }\n video {\n url\n position\n }\n }\n }\n companyProfile(zone: $zone) {\n id\n name\n companyNameSlug\n shouldDisplayReviews\n branding {\n logo\n }\n overview {\n description {\n paragraphs\n }\n industry\n size {\n description\n }\n website {\n url\n }\n }\n reviewsSummary {\n overallRating {\n numberOfReviews {\n value\n }\n value\n }\n }\n perksAndBenefits {\n title\n }\n }\n companySearchUrl(zone: $zone, languageCode: $languageCode)\n companyTags {\n key(languageCode: $languageCode)\n value\n }\n restrictedApplication(countryCode: $countryCode) {\n label(locale: $locale)\n }\n }\n}'};for(;!i&&a<3;)try{const o=await t.fetch("https://hk.jobsdb.com/graphql",{method:"POST",headers:c,body:JSON.stringify(u)});if(200!==await o.status)throw this.logger.error(`Hero ${this.id}.${e} encountered error status ${await o.status} on job fetch for p${s.url.searchParams.get("page")}`),new Error(`Hero ${this.id}.${e} encountered error status ${await o.status} on job fetch for p${s.url.searchParams.get("page")}`);const r=(await o.json()).data.jobDetails.job;Object.keys(C(r,{delimiter:"_"})).forEach(e=>{this.topLevelFields.has(e)||this.topLevelFields.add(e)}),r.content=D(r.content),this.outQueue.push(r,t=>{if(t)throw this.logger.error(`Hero ${this.id}.${e}'s queue push failed:, ${t.message}`),new Error(`Hero ${this.id}.${e}'s queue push failed:, ${t.message}`)}),i=!0,a++}catch(t){this.logger.error(`Hero ${this.id}.${e} failed to scrape job ${r} on page ${s.url.searchParams.get("page")} on attempt #${a}: ${t.toString()}`)}if(n.release(),3==a)throw this.logger.error(`Hero ${this.id}.${e} failed to scrape job ${r} on page ${s.url.searchParams.get("page")}`),new Error(`Hero ${this.id}.${e} failed to scrape job ${r} on page ${s.url.searchParams.get("page")}`)}async scrape_page_job_details(e,t,o,r,s){let n=[];for(let a of s.jobIds)await o.acquire(),n.push(this.scrape_job_details(e,t,r,a,s,o));await Promise.all(n)}async startWorker(e){const t=new i({noChromeSandbox:!0,blockedResourceTypes:["All"],connectionToCore:{host:`localhost:${this.cloudNodePort}`}});let o=0,r=0;const s=(await t.meta).userAgentString;this.logger.info(`Hero instance ${this.id}.${e} started`);const n=new S(8);try{for(;!this.pageQueue.isEmpty();){let a=[];const i=this.pageQueue.dequeue();this.searchResultsUrl.searchParams.set("page",String(i)),this.logger.info(`Hero ${this.id}.${e} dequeued page ${i}`),await t.goto(this.searchResultsUrl.href),await t.waitForLoad("DomContentLoaded");let l=t.querySelectorAll("article[data-job-id]"),c=await l.length;for(let 
e=0;e<c;e++){const t=l.item(e);await t.$waitForVisible({timeoutMs:1e4});const o=await t.getAttribute("data-job-id");a.push(o)}if(0===a.length)break;const u=new q(this.get_dict(await t.activeTab.cookieStorage.getItems())),d=new H(this.searchResultsUrl,u,a);await this.scrape_page_job_details(e,t,n,s,d),o++,r+=a.length,this.scrape_stats.totalPagesScraped++,this.scrape_stats.totalJobsScraped+=a.length,this.logger.info(`Hero ${this.id}.${e} successfully scraped page ${i} with ${a.length} jobs`)}this.logger.info(`Hero instance ${this.id}.${e} closed, scraped ${o} pages and ${r} jobs.`)}catch(o){throw this.logger.info(`Hero ${this.id}.${e} failed on ${await t.activeTab.url}`),o}finally{await t.close()}}async scrape_all_jobs(){const e=[];let t=Math.min(this.pageQueue.size(),10);for(let o=0;o<t;o++)e.push(this.startWorker(o));await Promise.all(e)}async __start__(){try{this.logger.info(`Starting scrape operation ${this.id}, using cloud node on port ${this.cloudNodePort}`);if("Timeout"===await Promise.race([this.scrape_all_jobs(),this.timeoutPromise]))throw new Error(`Scrape operation ${this.id} timed out after ${this.timeout} seconds.`)}catch(e){throw this.logger.error(`Scrape operation ${this.id} failed: ${e.message}`),e}finally{this.timeoutClear()}}},M="./jobsdb_scrape_logs";function W(e){return new Promise((t,o)=>{e.stdout?.once("data",e=>{try{const o=parseInt(e.toString());t(o)}catch(e){o(e)}})})}var V=y(P(import.meta.url)),B=y(V),Q="true"===process.env.LOG_ENABLED;Q?function(e){if(u.existsSync(e))try{u.rmSync(e,{recursive:!0,force:!0})}catch(e){console.error("Error while removing logs directory:",e)}}(b.join(B,"jobsdb_scrape_logs")):process.removeAllListeners("warning");var J,K,Y={totalJobsScraped:0,totalPagesScraped:0},G=function(e,t){if(!t)return T({level:"silent"});d.mkdirSync(M,{recursive:!0});const o=(new Date).toISOString().replace(/[:.]/g,"-"),r=`${M}/${e}-${o}.log`,s=T.destination({dest:r,sync:!1});return T({level:"info",name:e},s)}("client",Q),Z=new Set,X=[],ee=0,te=new k,oe=[],re=[],se=[],ne=Date.now()/1e3;async function ae(e){const t=e.maxPages,o=e.searchResultsUrl,r=e.numPages;if(!e.resultFileName){const t=g.utc().toFormat("yyyy-MM-dd_HH:mm:ss");e.resultFileName=`jobsdb-${o.hostname.substring(0,2)}-${r}-${t}`}const s=`${e.resultFileName}.ndjson`;let n=b.join(e.saveDir,s);for(let e=1;e<=r;e++)te.enqueue(e);var a,i;a=n,J=u.createWriteStream(a,{flags:"a",flush:!0,highWaterMark:67108864}),i=J,K=p.queue((e,t)=>{try{const o=JSON.stringify(e)+"\n";i.write(o)?t():i.once("drain",()=>{i.write(o)?t():t(new Error("Write failed after drain"))})}catch(e){t(e)}},1),J.on("error",e=>{throw G.error(`WriteStream error: ${e.message}`),new Error(`WriteStream error: ${e.message}`)}),ee=r>10?2:1;try{for(let e=0;e<ee;e++){const t=_("node",[b.join(B,"src/cloudnode.js"),String(e)],{shell:!1});G.info(`Starting cloudnode ${e}...`),X.push(t)}for(let e=0;e<ee;e++)se.push(await W(X[e])),G.info(`Cloudnode ${e} started on port ${se[e]}`);for(let e=0;e<ee;e++)oe.push(new U(e,o,se[e],K,G.child({module:`scrapeOp${e}`}),te,Z,Y)),re.push(oe[e].__start__()),G.info(`Scrape operation ${e} initialized`);let s=!1;L(`Scraping ${r}/${t} available pages of jobs on ${o.href}.`),G.info(`Scraping ${r}/${t} available pages of jobs on ${o.href}.`);const a=Promise.all(re).finally(()=>{s=!0,te.isEmpty()||console.error("\nCouldn't complete scraping operation at this time, try again in ~1min, if still persists, please file an issue on github")});for(process.env.IS_LIBRARY_CALL&&await a;!s;)x(r-te.size(),r),await N(1e3);return 
L(),G.info("All scrape operations completed."),await async function(e){return new Promise(t=>{0===e.length()&&0===e.running()?t():e.drain(t)})}(K),G.info("Object queue drained."),await async function(e){return new Promise(t=>{e.once("finish",t),e.end()})}(J),G.info("Outstream closed."),"csv"===e.format&&(await async function(e,t){const o=e.replace(".ndjson",".csv"),r=u.createReadStream(e,{encoding:"utf-8"}),s=u.createWriteStream(o,{encoding:"utf-8"}),n={ndjson:!0,defaultValue:null,transforms:[w({objects:!0,arrays:!0,separator:"_"}),O],fields:[...t]},a=new f(n,{},{});await m(r,a,s)}(n,Z),h(n),n=n.replace(".ndjson",".csv")),L(`Scrape finished in ${Math.floor(Date.now()/1e3-ne)} seconds`),L(`Result file saved to ./${n}`),G.info(`Result file saved to ./${n}`),G.info(`Scrape finished in ${Math.floor(Date.now()/1e3-ne)} seconds`),{resultPath:n,scrape_stats:Y}}catch(e){"EACCES"===e.code?(console.error("The specified result directory does not have write permissions."),G.error("The specified result directory does not have write permissions.")):(console.error("scrape_jobsdb.ts in main:",e),G.error(`Error during scraping: ${e.message}`))}finally{for(let e=0;e<ee;e++)X.length>0&&(G.info(`Shutting down CloudNode ${e} on port ${se[e]}...`),!1===X[e].kill()&&(console.error("Error during CloudNode shutdown"),G.error(`Error during CloudNode ${e} shutdown`)))}}e.version("1.0.6","-v, --version","Output the version number"),e.command("maxPages").description("Find maximum number of pages available to scrape for a given search results url").addArgument(new t("[searchResultsUrl]","The job listing results url (e.g. https://hk.jobsdb.com/jobs)")).action(async e=>{try{const t=await A(e),o=await R(t);console.log(`Found ${o} pages available to scrape on ${e}`)}catch(e){console.error(e.message)}}),e.command("scrape",{isDefault:!0}).description("Scrape job listings").addArgument(new t("[searchResultsUrl]","The job listing results url (e.g. https://hk.jobsdb.com/jobs)")).addOption(new o("-n, --numPages <number>","Number of pages to scrape").default("all")).addOption(new o("--fn, --resultFileName <saveFileName>","Name of the results save file (without extension)")).addOption(new o("-f, --format <file_format>","File format to use, csv files fields are completely flattened (including arrays), and date/time fields are normalized to SQL DateTime.").default("ndjson").choices(["ndjson","csv"])).addOption(new o("-s, --saveDir <pathToDir>","Relative path directory from where the program is run to store results file (optional)").default("./jobsdb_scrape_results")).action(async(e,t)=>{try{const o=z(t.saveDir);t.resultFileName&&function(e,t){if(!v(`${e}.${t}`))throw new r(`Provided filename "${e}" is invalid`)}(t.resultFileName,t.format);const s=await A(e),[n,a]=await async function(e,t){const o=await R(t);if(-1==o)throw new Error("\nCouldn't find the pages available to scrape, please file an issue on github");if("all"==e)return[o,o];const s=parseInt(e);if(isNaN(s))throw new r("Provided numPages arg is not a number.");if(s<1)throw new r("Provided numPages arg must be >=1");if(o<s)throw new r(`Provided numPages args must be <= ${o}`);return[s,o]}(t.numPages,s);t.numPages=n,t.maxPages=a,t.saveDir=o,t.searchResultsUrl=s}catch(e){return void console.error(e.message)}await ae(t)}),(async()=>{await e.parseAsync(process.argv)})();
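Aside from minifier identifier renames (the swapped fs aliases and the A/O helpers), the main functional addition visible in the 1.0.6 bundle appears to be a commander version flag registered before the commands. A de-minified sketch of that registration; variable names are ours, only the arguments come from the minified source:

```ts
import { program } from 'commander';

// New in the 1.0.6 bundle: expose the package version via -v / --version.
program.version('1.0.6', '-v, --version', 'Output the version number');

// The maxPages and scrape commands are registered afterwards, and the CLI
// is started the same way as in 1.0.4:
(async () => {
  await program.parseAsync(process.argv);
})();
```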
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "jobsdb-scraper",
- "version": "1.0.4",
+ "version": "1.0.6",
  "description": "A tool for automatically collecting job information from JobsDB",
  "main": "dist/src/scrape_jobsdb.js",
  "bin": {