jobsdb-scraper 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Proprietary License
2
+
3
+ Copyright (c) [2025] [Krish Galani, krishdgala@gmail.com]. All rights reserved.
4
+
5
+ This software is proprietary and confidential.
6
+
7
+ Granted Rights:
8
+ - You may install and use the Package via npm.
9
+ - You may execute the Package as a CLI tool.
10
+ - You may import and use the Package as a library in your own software projects.
11
+
12
+ Restrictions:
13
+ - You may not redistribute, sublicense, or make available the Package (or any part of it) to third parties, except as bundled within your own applications.
14
+ - You may not modify, reverse-engineer, decompile, disassemble, or create derivative works of the Package for the purpose of redistribution or public release.
15
+ - You may not remove or alter any copyright, trademark, or other proprietary notices in the Package.
16
+
17
+ No Warranty:
18
+ THE PACKAGE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY ARISING FROM, OUT OF, OR IN CONNECTION WITH THE PACKAGE OR THE USE THEREOF.
19
+
20
+ This license applies only to the distributed version of the package published on npm. The original source code remains fully reserved and is not licensed for any use beyond what is explicitly granted above.
package/README.md ADDED
@@ -0,0 +1,116 @@
1
+
2
+ # JobsDB Scraper
3
+
4
+ <img src="https://raw.githubusercontent.com/krishgalani/jobsdb-scraper/main/assets/jobsdb.png" width="300" alt="JobsDB Logo"><br>
5
+
6
+ ![Static Badge](https://img.shields.io/badge/npm-package?logo=npm&logoSize=auto&color=red&link=https%3A%2F%2Fwww.npmjs.com%2Fpackage%2Fjobsdb-scraper)
7
+
8
+
9
+ About this scraper:
10
+
11
+ - You can use this scraper to download publicly advertised job information from any job search results page (e.g. salaries, job requirements, etc.).
12
+
13
+ A few cool highlights:
14
+
15
+ - **Lightweight, and made to run on commodity computers** - Low memory/CPU utilization thanks to efficient use of the Ulixee web-scraping framework (https://github.com/ulixee).
16
+ - **Mindful of Traffic** - Respects website rate limits and exhibits considerate bot behavior.
17
+ - **Avoids detection along the entire stack** - Strong guarantees on the ability to scrape jobs safely and sidestep anti-bot measures.
18
+ - **Customize which pages and how many you want to scrape** - You can specify any valid JobsDB search results URL and the number of pages to scrape, up to all available pages.
19
+
20
+ ## Installation & Usage Instructions
21
+
22
+ ### Requirements:
23
+
24
+ - **Node.js** version >= **18** and <= **22**. If not installed, [go here](https://nodejs.org/en/download/) to download it. You can check your version with `node --version`, switch versions with `nvm use <node_version>`, or set your default with `nvm alias default <node_version>` (see the snippet after these requirements). **Warning: if you use the wrong Node version you may get an error when trying to run.**
25
+
26
+ - While not strictly required, a residential IP address is highly recommended. For the best chance of avoiding bot detection, run the scraper from your home network. If you must run it from elsewhere, I recommend using a residential IP proxy.
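+ For example, assuming nvm is installed, you can check your current Node version and switch to a supported one like this:
+ 
+ ```shell script
+ # Check the currently active Node.js version
+ node --version
+ 
+ # Switch to a supported version for this shell session (20 is just an example in the 18-22 range)
+ nvm use 20
+ 
+ # Optionally make it the default for new shells
+ nvm alias default 20
+ ```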
27
+
28
+ ### Option 1: Install globally (Recommended for most users)
29
+
30
+ ```shell script
31
+
32
+ # 1. Install the package globally (this may take a few minutes).
33
+ npm install -g jobsdb-scraper
34
+
35
+ # To find the max available pages to scrape for a given JobsDB search results URL:
36
+ jobsdb-scraper maxPages <searchResultsUrl>
37
+
38
+ # For instructions on how to run the scraper (a scrape can take up to ~10 minutes):
39
+ jobsdb-scraper scrape -h
40
+ ```
41
+ ### Usage Examples
42
+ ```shell script
43
+ # Scrape 50 pages of jobs in Hong Kong and return results in ndjson format
44
+ jobsdb-scraper hk.jobsdb.com/jobs -n 50 -f ndjson
45
+
46
+ # Scrape all Software Engineering jobs in Hong Kong, return results in csv format, and save them to a folder called results in the current working directory.
47
+ jobsdb-scraper hk.jobsdb.com/Software-Engineer-jobs -f csv -n 'all' -s './results'
48
+
49
+ # Scrape all accounting jobs in Thailand, return results in ndjson format, and set the output file name to "accounting_jobs"
50
+ jobsdb-scraper th.jobsdb.com/jobs-in-accounting -f ndjson -n 'all' --fn accounting_jobs
51
+ ```
52
+ ### Option 2: Install package as a dependency
53
+
54
+ 1. Open a CLI in your project root:
55
+ ```shell script
56
+ # This may take a few minutes.
57
+ npm install --save jobsdb-scraper
58
+ ```
59
+
60
+ 2. Import and use!
61
+ ```ts
62
+ // Warning: These operations are **NOT** thread-safe.
63
+ import {scrapeJobsdb, findMaxPages} from 'jobsdb-scraper';
64
+ import { ScrapeOptions } from 'jobsdb-scraper/dist/src/types.js';
65
+ import type { ScrapeStats } from 'jobsdb-scraper/dist/src/types.js';
66
+ (async () => {
67
+ const scrapeops = new ScrapeOptions({
68
+ //searchResultsUrlString (required): The URL of the first page of search results to start scraping from.
69
+ searchResultsUrlString: 'hk.jobsdb.com/jobs',
70
+ //numPages (optional): The number of pages to scrape, 'all' by default
71
+ numPages: 1,
72
+ //saveDir (optional): The directory relative to the current working directory where you want to save results.
73
+ saveDir: './jobsdb-scrape-results',
74
+ //format (optional): The format in which to save the results, 'ndjson' or 'csv'. 'ndjson' by default.
75
+ format: 'ndjson',
76
+ //resultFileName (optional): The name of the result file, auto-generated if omitted.
77
+ resultFileName: 'my_scrape_results',
78
+ })
79
+ try {
80
+ //Will throw if an invalid search results URL is provided
81
+ const maxPagesHk = await findMaxPages('hk.jobsdb.com/jobs')
82
+ console.log(`Max Pages in HK JobsDB: ${maxPagesHk}`)
83
+ //Will throw if any scrape options are invalid
84
+ const scrape_result = await scrapeJobsdb(scrapeops)
85
+ if(scrape_result !== undefined){
86
+ const { resultPath, scrape_stats } = scrape_result
87
+ const { totalJobsScraped, totalPagesScraped }: ScrapeStats = scrape_stats
88
+ console.log(`Total Jobs Scraped: ${totalJobsScraped}`)
89
+ console.log(`Total Pages Scraped: ${totalPagesScraped}`)
90
+ console.log(`Results saved to: ${resultPath}`);
91
+ }
92
+ } catch (error: any){
93
+ //handle the error here
94
+ }
95
+ })();
96
+ ```
97
+ 3. Alternatively, you can run the locally installed package with `npx jobsdb-scraper -h`
98
+
99
+ By default the result file is named `jobsdb-<region>-<num_pages>-<YYYY-MM-DD_HH:MM:SS>.<format>` and saved to `<path_to_current_working_directory>/jobsdb_scrape_results`. The timestamp is in UTC. Jobs are not returned in any particular order.
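+ As a rough sketch, one way to consume an ndjson result file from TypeScript is to stream it line by line (the file path below is a made-up example of the naming scheme above):
+ 
+ ```ts
+ import * as fs from 'fs';
+ import * as readline from 'readline';
+ 
+ // Sketch: stream an ndjson result file and parse one job object per line.
+ async function readResults(resultPath: string) {
+   const rl = readline.createInterface({ input: fs.createReadStream(resultPath, { encoding: 'utf-8' }) });
+   for await (const line of rl) {
+     if (!line.trim()) continue;
+     const job = JSON.parse(line);
+     console.log(job.id, job.title);
+   }
+ }
+ 
+ // Hypothetical file produced by a 50-page Hong Kong scrape.
+ readResults('./jobsdb_scrape_results/jobsdb-hk-50-2025-01-01_00:00:00.ndjson');
+ ```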
100
+
101
+ ## Have additional requirements?
102
+
103
+ You can contact me at krishdgala@gmail.com with your requirements for a quote. Further data scraping & analysis can be performed to suit your needs.
104
+
105
+ ## Questions or Bugs?
106
+ Please raise an issue on GitHub.
107
+
108
+ ## How it works
109
+
110
+ The server side of the program consists of at most two locally hosted @ulixee/cloud server nodes, which act as the engines behind page navigation and fetches; each node hosts a browser with many browsing environments. The cap of two cloud nodes was chosen after testing for the maximum number of parallel nodes that could run before run time was impacted (tests were run on an M1 MacBook Air).
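+ As a rough illustration, each server node boils down to something like the sketch below, modeled on the bundled `cloudnode.js` (logging, the no-sandbox plugin, and shutdown handling are omitted):
+ 
+ ```ts
+ import { CloudNode } from '@ulixee/cloud';
+ 
+ // Sketch of one locally hosted server node; the real cloudnode.js also registers a
+ // no-sandbox plugin and handles SIGINT/SIGTERM.
+ (async () => {
+   const node = new CloudNode({
+     heroConfiguration: { disableSessionPersistence: true },
+   });
+   await node.listen();
+   // The client process reads this port from stdout and points its Hero workers at it.
+   console.log(await node.port);
+ })();
+ ```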
111
+
112
+ The client uses the Ulixee framework (github.com/ulixee): each worker (a @ulixee/hero instance) is connected to one of the @ulixee/cloud server nodes and has its own browser environment. A worker pops a page from the shared queue of requested pages and makes GET and POST fetches to the JobsDB HTTP/GraphQL servers for the relevant data. For each page, the job IDs are first parsed from the returned HTML response; then, for each job ID, a fetch to the backend GraphQL API is made for the job details. Results are received in real time and written to a local file.
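+ A simplified sketch of that per-worker loop is shown below (plugins, cookies, custom headers, retries, and concurrency limits are omitted, and the GraphQL request body is abbreviated):
+ 
+ ```ts
+ import Hero from '@ulixee/hero';
+ 
+ // Sketch of one worker: dequeue a page, collect job IDs from the search results HTML,
+ // then fetch each job's details from the JobsDB GraphQL endpoint.
+ async function worker(searchResultsUrl: URL, pageQueue: number[], cloudNodePort: number) {
+   const hero = new Hero({ connectionToCore: { host: `localhost:${cloudNodePort}` } });
+   try {
+     while (pageQueue.length > 0) {
+       const page = pageQueue.shift()!;
+       searchResultsUrl.searchParams.set('page', String(page));
+       await hero.goto(searchResultsUrl.href);
+       await hero.waitForLoad('DomContentLoaded');
+ 
+       // Job IDs are read off the job cards on the results page.
+       const cards = hero.querySelectorAll('article[data-job-id]');
+       const jobIds: string[] = [];
+       for (let i = 0; i < await cards.length; i++) {
+         const id = await cards.item(i).getAttribute('data-job-id');
+         if (id) jobIds.push(id);
+       }
+ 
+       // Each job's details come from a POST to the GraphQL endpoint (the real request
+       // includes session cookies and the full jobDetails query).
+       for (const jobId of jobIds) {
+         const response = await hero.fetch('https://hk.jobsdb.com/graphql', {
+           method: 'POST',
+           headers: { 'content-type': 'application/json' },
+           body: JSON.stringify({ operationName: 'jobDetails', variables: { jobId } }),
+         });
+         const job = (await response.json()).data.jobDetails.job;
+         // ...push `job` onto the shared write queue here.
+       }
+     }
+   } finally {
+     await hero.close();
+   }
+ }
+ ```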
113
+
114
+ ## License
115
+
116
+ [PROPRIETARY](LICENSE)
Binary file
@@ -0,0 +1,8 @@
1
+ import { CorePlugin } from '@ulixee/hero-plugin-utils';
2
+
3
+ declare class NoSandboxPlugin extends CorePlugin {
4
+ static readonly id = "NoSandboxPlugin";
5
+ onNewBrowser(browser: any, userConfig: any): void;
6
+ }
7
+
8
+ export { NoSandboxPlugin as default };
@@ -0,0 +1 @@
1
+ import{CorePlugin as s}from"@ulixee/hero-plugin-utils";var e=class extends s{static id="NoSandboxPlugin";onNewBrowser(s,e){this.browserEngine.launchArguments.push("--no-sandbox","--disable-setuid-sandbox")}};export{e as default};
@@ -0,0 +1 @@
1
+ #!/usr/bin/env node
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env node
2
+ import{program as e,Argument as t,Option as o,InvalidArgumentError as r}from"commander";import{compile as s}from"html-to-text";import n from"@ulixee/hero-core";import{TransportBridge as a}from"@ulixee/net";import i,{ConnectionToHeroCore as l}from"@ulixee/hero";import{CorePlugin as c}from"@ulixee/hero-plugin-utils";import*as d from"fs";import u,{unlinkSync as h}from"fs";import{pipeline as m}from"stream/promises";import p from"async";import{Transform as f}from"@json2csv/node";import{DateTime as g}from"luxon";import{flatten as w}from"@json2csv/transforms";import*as b from"path";import $,{dirname as y}from"path";import v from"valid-filename";import{v4 as j}from"uuid";import{Sema as S}from"async-sema";import{flatten as C}from"flat";import{spawn as _}from"child_process";import{fileURLToPath as P}from"url";import k from"queue-fifo";import T from"pino";var I=class extends c{static id="NoSandboxPlugin";onNewBrowser(e,t){this.browserEngine.launchArguments.push("--no-sandbox","--disable-setuid-sandbox")}};function x(e,t,o=40){if(e===t)return;const r=Math.min(e/t,1),s=Math.round(r*o),n="█".repeat(s)+"-".repeat(o-s),a=(100*r).toFixed(2);process.stdout.isTTY?(process.stdout.clearLine(0),process.stdout.cursorTo(0),process.stdout.write(`Progress: [${n}] ${a}%`),e>=t&&process.stdout.write("\n")):console.log(`Progress: [${n}] ${a}%`)}function N(e){return new Promise(t=>setTimeout(t,e))}var L=(...e)=>{process.env.IS_LIBRARY_CALL||console.log(...e)};var D=s({});async function E(e,t){await e.goto(t),await e.waitForLoad("DomContentLoaded");const o=e.querySelector('section[data-automation="search-zero-results"], article[data-job-id]');try{await o.$waitForVisible({timeoutMs:1e4})}catch(e){throw new Error(`Fatal Error: Unable to determine whether ${t} is has search results`)}return!o||"SECTION"===await o.tagName}async function R(e,t){let o;L("Finding number of pages available to scrape on %s",e.href);let r=!1;try{if(void 0===t){r=!0;const e=new a,s=new l(e.transportToCore);o=new n({disableSessionPersistence:!0}),o.use(I),o.addConnection(e.transportToClient),t=new i({noChromeSandbox:!0,blockedResourceTypes:["All"],showChrome:!1,connectionToCore:s})}let s=1,c=1e3,d=new URL(e.href);for(;s<=c;){let e=Math.trunc((s+c)/2);d.searchParams.set("page",e.toString()),await E(t,d.href)?c=e-1:s=e+1}return s-1}catch(e){throw e}finally{r&&(await t.close(),await o.close())}}function F(e){if(!e)return;const t=g.fromISO(e);return t.isValid?t.toFormat("yyyy-MM-dd HH:mm:ss"):void 0}function A(e){for(const t in e){const o=e[t];if("string"==typeof o&&o.match(/^\d{4}-\d{2}-\d{2}/))try{e[t]=F(o)}catch(r){console.warn(`Failed to parse date for key "${t}": ${o}, error: ${r}`),e[t]=null}}return e}async function O(e){const t=["hk.jobsdb.com","th.jobsdb.com"];if(!e||"string"!=typeof e||0===e.length)throw new r("URL must be a non-empty string");let o,s=null,c=null;try{if(t.includes(e)&&(e+="/jobs"),e.startsWith("https://")||(e="https://"+e),o=new URL(e),!t.includes(o.hostname))throw new r(`Invalid search url, urls must have either of the following hostnames [${t}] and point to a valid search results page`);const d=new a;c=new n({disableSessionPersistence:!0}),c.addConnection(d.transportToClient),c.use(I),s=new i({noChromeSandbox:!0,blockedResourceTypes:["All"],connectionToCore:new l(d.transportToCore)});if(await E(s,o.href))throw new r(`Invalid search url, urls must have either of the following hostnames [${t}] and point to a valid search results page`);return o}catch(e){throw e}finally{s&&await s.close(),c&&await c.close()}}function 
z(e){if("string"!=typeof e||""===e.trim())throw new r("Save directory path must be a non-empty string");const t=e.trim(),o=$.join(process.cwd(),t);if(d.existsSync(o)){if(!d.statSync(o).isDirectory())throw new r(`Save directory path exists but is not a directory: ${t}`);try{d.accessSync(o,d.constants.W_OK)}catch{throw new r(`Provided directory exists but is not writable: ${t}`)}}else s=o,d.existsSync(s)||d.mkdirSync(s,{recursive:!0});var s;return t}var q=class{cookie;constructor(e={}){this.cookie=e}},H=class{jobIds;url;cookie;constructor(e,t=new q,o=[]){this.url=e,this.jobIds=o,this.cookie=t}},U=class{id;searchResultsUrl;cloudNodePort;pageQueue;timeout;logger;scrape_stats;topLevelFields;outQueue;timeoutPromise;timeoutClear;constructor(e,t,o,r,s,n,a,i,l=3600){this.id=e,this.scrape_stats=i,this.searchResultsUrl=t,this.outQueue=r,this.logger=s,this.pageQueue=n,this.timeout=l,this.topLevelFields=a;const{promise:c,clear:d}=function(e,t){let o;return{promise:new Promise((r,s)=>{o=setTimeout(()=>s(t),1e3*e)}),clear:()=>clearTimeout(o)}}(l,"Timeout");this.timeoutPromise=c.then(()=>s.error(`Scrape op ${e} timed out.`)),this.timeoutClear=d,this.cloudNodePort=o}assemble_cookie(e){return Object.entries(e.cookie).map(([e,t])=>`${e}=${t}`).join("; ")}get_dict(e){const t={};for(let o=0;o<e.length;o++)t[e[o].name]=e[o].value;return t}async scrape_job_details(e,t,o,r,s,n){let a=0;let i=!1,l=s.cookie.cookie;const c={accept:"*/*","accept-language":"en-GB,en-US;q=0.9,en;q=0.8","content-type":"application/json",origin:"https://hk.jobsdb.com",Connection:"keep-alive",Cookie:this.assemble_cookie(s.cookie),priority:"u=1, i",referer:s.url.href,"sec-fetch-dest":"empty","sec-fetch-mode":"cors","sec-fetch-site":"same-origin","seek-request-brand":"jobsdb","seek-request-country":"HK","user-agent":o,"x-seek-ec-sessionid":l.JobseekerSessionId??"","x-seek-ec-visitorid":l.JobseekerVisitorId??"","x-seek-site":"chalice"},d={operationName:"jobDetails",variables:{jobId:r,jobDetailsViewedCorrelationId:j(),sessionId:l.JobseekerSessionId??"",zone:"asia-1",locale:"en-HK",languageCode:"en",countryCode:"HK",timezone:"America/New_York"},query:'query jobDetails($jobId: ID!, $jobDetailsViewedCorrelationId: String!, $sessionId: String!, $zone: Zone!, $locale: Locale!, $languageCode: LanguageCodeIso!, $countryCode: CountryCodeIso2!, $timezone: Timezone!) 
{\n jobDetails(\n id: $jobId\n tracking: {channel: "WEB", jobDetailsViewedCorrelationId: $jobDetailsViewedCorrelationId, sessionId: $sessionId}\n ) {\n job {\n sourceZone\n id\n title\n phoneNumber\n isExpired\n expiresAt {\n dateTimeUtc\n }\n isLinkOut\n contactMatches {\n type\n value\n }\n isVerified\n abstract\n content(platform: WEB)\n status\n listedAt {\n label(context: JOB_POSTED, length: SHORT, timezone: $timezone, locale: $locale)\n dateTimeUtc\n }\n salary {\n currencyLabel(zone: $zone)\n label\n }\n shareLink(platform: WEB, zone: $zone, locale: $locale)\n workTypes {\n label(locale: $locale)\n }\n advertiser {\n id\n name(locale: $locale)\n isVerified\n registrationDate {\n dateTimeUtc\n }\n }\n location {\n label(locale: $locale, type: LONG)\n }\n classifications {\n label(languageCode: $languageCode)\n }\n products {\n branding {\n id\n cover {\n url\n }\n thumbnailCover: cover(isThumbnail: true) {\n url\n }\n logo {\n url\n }\n }\n bullets\n questionnaire {\n questions\n }\n video {\n url\n position\n }\n }\n }\n companyProfile(zone: $zone) {\n id\n name\n companyNameSlug\n shouldDisplayReviews\n branding {\n logo\n }\n overview {\n description {\n paragraphs\n }\n industry\n size {\n description\n }\n website {\n url\n }\n }\n reviewsSummary {\n overallRating {\n numberOfReviews {\n value\n }\n value\n }\n }\n perksAndBenefits {\n title\n }\n }\n companySearchUrl(zone: $zone, languageCode: $languageCode)\n companyTags {\n key(languageCode: $languageCode)\n value\n }\n restrictedApplication(countryCode: $countryCode) {\n label(locale: $locale)\n }\n }\n}'};for(;!i&&a<3;)try{const o=await t.fetch("https://hk.jobsdb.com/graphql",{method:"POST",headers:c,body:JSON.stringify(d)});if(200!==await o.status)throw this.logger.error(`Hero ${this.id}.${e} encountered error status ${await o.status} on job fetch for p${s.url.searchParams.get("page")}`),new Error(`Hero ${this.id}.${e} encountered error status ${await o.status} on job fetch for p${s.url.searchParams.get("page")}`);const r=(await o.json()).data.jobDetails.job;Object.keys(C(r,{delimiter:"_"})).forEach(e=>{this.topLevelFields.has(e)||this.topLevelFields.add(e)}),r.content=D(r.content),this.outQueue.push(r,t=>{if(t)throw this.logger.error(`Hero ${this.id}.${e}'s queue push failed:, ${t.message}`),new Error(`Hero ${this.id}.${e}'s queue push failed:, ${t.message}`)}),i=!0,a++}catch(t){this.logger.error(`Hero ${this.id}.${e} failed to scrape job ${r} on page ${s.url.searchParams.get("page")} on attempt #${a}: ${t.toString()}`)}if(n.release(),3==a)throw this.logger.error(`Hero ${this.id}.${e} failed to scrape job ${r} on page ${s.url.searchParams.get("page")}`),new Error(`Hero ${this.id}.${e} failed to scrape job ${r} on page ${s.url.searchParams.get("page")}`)}async scrape_page_job_details(e,t,o,r,s){let n=[];for(let a of s.jobIds)await o.acquire(),n.push(this.scrape_job_details(e,t,r,a,s,o));await Promise.all(n)}async startWorker(e){const t=new i({noChromeSandbox:!0,blockedResourceTypes:["All"],connectionToCore:{host:`localhost:${this.cloudNodePort}`}});let o=0,r=0;const s=(await t.meta).userAgentString;this.logger.info(`Hero instance ${this.id}.${e} started`);const n=new S(8);try{for(;!this.pageQueue.isEmpty();){let a=[];const i=this.pageQueue.dequeue();this.searchResultsUrl.searchParams.set("page",String(i)),this.logger.info(`Hero ${this.id}.${e} dequeued page ${i}`),await t.goto(this.searchResultsUrl.href),await t.waitForLoad("DomContentLoaded");let l=t.querySelectorAll("article[data-job-id]"),c=await l.length;for(let 
e=0;e<c;e++){const t=l.item(e);await t.$waitForVisible({timeoutMs:1e4});const o=await t.getAttribute("data-job-id");a.push(o)}if(0===a.length)break;const d=new q(this.get_dict(await t.activeTab.cookieStorage.getItems())),u=new H(this.searchResultsUrl,d,a);await this.scrape_page_job_details(e,t,n,s,u),o++,r+=a.length,this.scrape_stats.totalPagesScraped++,this.scrape_stats.totalJobsScraped+=a.length,this.logger.info(`Hero ${this.id}.${e} successfully scraped page ${i} with ${a.length} jobs`)}this.logger.info(`Hero instance ${this.id}.${e} closed, scraped ${o} pages and ${r} jobs.`)}catch(o){throw this.logger.info(`Hero ${this.id}.${e} failed on ${await t.activeTab.url}`),o}finally{await t.close()}}async scrape_all_jobs(){const e=[];let t=Math.min(this.pageQueue.size(),10);for(let o=0;o<t;o++)e.push(this.startWorker(o));await Promise.all(e)}async __start__(){try{this.logger.info(`Starting scrape operation ${this.id}, using cloud node on port ${this.cloudNodePort}`);if("Timeout"===await Promise.race([this.scrape_all_jobs(),this.timeoutPromise]))throw new Error(`Scrape operation ${this.id} timed out after ${this.timeout} seconds.`)}catch(e){throw this.logger.error(`Scrape operation ${this.id} failed: ${e.message}`),e}finally{this.timeoutClear()}}},M="./jobsdb_scrape_logs";function W(e){return new Promise((t,o)=>{e.stdout?.once("data",e=>{try{const o=parseInt(e.toString());t(o)}catch(e){o(e)}})})}var V=y(P(import.meta.url)),B=y(V),Q="true"===process.env.LOG_ENABLED;Q?function(e){if(d.existsSync(e))try{d.rmSync(e,{recursive:!0,force:!0})}catch(e){console.error("Error while removing logs directory:",e)}}(b.join(B,"jobsdb_scrape_logs")):process.removeAllListeners("warning");var J,K,Y={totalJobsScraped:0,totalPagesScraped:0},G=function(e,t){if(!t)return T({level:"silent"});u.mkdirSync(M,{recursive:!0});const o=(new Date).toISOString().replace(/[:.]/g,"-"),r=`${M}/${e}-${o}.log`,s=T.destination({dest:r,sync:!1});return T({level:"info",name:e},s)}("client",Q),Z=new Set,X=[],ee=0,te=new k,oe=[],re=[],se=[],ne=Date.now()/1e3;async function ae(e){const t=e.maxPages,o=e.searchResultsUrl,r=e.numPages;if(!e.resultFileName){const t=g.utc().toFormat("yyyy-MM-dd_HH:mm:ss");e.resultFileName=`jobsdb-${o.hostname.substring(0,2)}-${r}-${t}`}const s=`${e.resultFileName}.ndjson`;let n=b.join(e.saveDir,s);for(let e=1;e<=r;e++)te.enqueue(e);var a,i;a=n,J=d.createWriteStream(a,{flags:"a",flush:!0,highWaterMark:67108864}),i=J,K=p.queue((e,t)=>{try{const o=JSON.stringify(e)+"\n";i.write(o)?t():i.once("drain",()=>{i.write(o)?t():t(new Error("Write failed after drain"))})}catch(e){t(e)}},1),J.on("error",e=>{throw G.error(`WriteStream error: ${e.message}`),new Error(`WriteStream error: ${e.message}`)}),ee=r>10?2:1;try{for(let e=0;e<ee;e++){const t=_("node",[b.join(B,"src/cloudnode.js"),String(e)],{shell:!1});G.info(`Starting cloudnode ${e}...`),X.push(t)}for(let e=0;e<ee;e++)se.push(await W(X[e])),G.info(`Cloudnode ${e} started on port ${se[e]}`);for(let e=0;e<ee;e++)oe.push(new U(e,o,se[e],K,G.child({module:`scrapeOp${e}`}),te,Z,Y)),re.push(oe[e].__start__()),G.info(`Scrape operation ${e} initialized`);let s=!1;L(`Scraping ${r}/${t} available pages of jobs on ${o.href}.`),G.info(`Scraping ${r}/${t} available pages of jobs on ${o.href}.`);const a=Promise.all(re).finally(()=>{s=!0,te.isEmpty()||console.error("\nCouldn't complete scraping operation at this time, try again in ~1min, if still persists, please file an issue on github")});for(process.env.IS_LIBRARY_CALL&&await a;!s;)x(r-te.size(),r),await N(1e3);return 
L(),G.info("All scrape operations completed."),await async function(e){return new Promise(t=>{0===e.length()&&0===e.running()?t():e.drain(t)})}(K),G.info("Object queue drained."),await async function(e){return new Promise(t=>{e.once("finish",t),e.end()})}(J),G.info("Outstream closed."),"csv"===e.format&&(await async function(e,t){const o=e.replace(".ndjson",".csv"),r=d.createReadStream(e,{encoding:"utf-8"}),s=d.createWriteStream(o,{encoding:"utf-8"}),n={ndjson:!0,defaultValue:null,transforms:[w({objects:!0,arrays:!0,separator:"_"}),A],fields:[...t]},a=new f(n,{},{});await m(r,a,s)}(n,Z),h(n),n=n.replace(".ndjson",".csv")),L(`Scrape finished in ${Math.floor(Date.now()/1e3-ne)} seconds`),L(`Result file saved to ./${n}`),G.info(`Result file saved to ./${n}`),G.info(`Scrape finished in ${Math.floor(Date.now()/1e3-ne)} seconds`),{resultPath:n,scrape_stats:Y}}catch(e){"EACCES"===e.code?(console.error("The specified result directory does not have write permissions."),G.error("The specified result directory does not have write permissions.")):(console.error("scrape_jobsdb.ts in main:",e),G.error(`Error during scraping: ${e.message}`))}finally{for(let e=0;e<ee;e++)X.length>0&&(G.info(`Shutting down CloudNode ${e} on port ${se[e]}...`),!1===X[e].kill()&&(console.error("Error during CloudNode shutdown"),G.error(`Error during CloudNode ${e} shutdown`)))}}e.command("maxPages").description("Find maximum number of pages available to scrape for a given search results url").addArgument(new t("[searchResultsUrl]","The job listing results url (e.g. https://hk.jobsdb.com/jobs)")).action(async e=>{try{const t=await O(e),o=await R(t);console.log(`Found ${o} pages available to scrape on ${e}`)}catch(e){console.error(e.message)}}),e.command("scrape",{isDefault:!0}).description("Scrape job listings").addArgument(new t("[searchResultsUrl]","The job listing results url (e.g. https://hk.jobsdb.com/jobs)")).addOption(new o("-n, --numPages <number>","Number of pages to scrape").default("all")).addOption(new o("--fn, --resultFileName <saveFileName>","Name of the results save file (without extension)")).addOption(new o("-f, --format <file_format>","File format to use, csv files fields are completely flattened (including arrays), and date/time fields are normalized to SQL DateTime.").default("ndjson").choices(["ndjson","csv"])).addOption(new o("-s, --saveDir <pathToDir>","Relative path directory from where the program is run to store results file (optional)").default("./jobsdb_scrape_results")).action(async(e,t)=>{try{const o=z(t.saveDir);t.resultFileName&&function(e,t){if(!v(`${e}.${t}`))throw new r(`Provided filename "${e}" is invalid`)}(t.resultFileName,t.format);const s=await O(e),[n,a]=await async function(e,t){const o=await R(t);if(-1==o)throw new Error("\nCouldn't find the pages available to scrape, please file an issue on github");if("all"==e)return[o,o];const s=parseInt(e);if(isNaN(s))throw new r("Provided numPages arg is not a number.");if(s<1)throw new r("Provided numPages arg must be >=1");if(o<s)throw new r(`Provided numPages args must be <= ${o}`);return[s,o]}(t.numPages,s);t.numPages=n,t.maxPages=a,t.saveDir=o,t.searchResultsUrl=s}catch(e){return void console.error(e.message)}await ae(t)}),(async()=>{await e.parseAsync(process.argv)})();
@@ -0,0 +1,2 @@
1
+
2
+ export { }
@@ -0,0 +1 @@
1
+ import{CloudNode as o}from"@ulixee/cloud";import{CorePlugin as e}from"@ulixee/hero-plugin-utils";import r from"fs";import n from"pino";var s=class extends e{static id="NoSandboxPlugin";onNewBrowser(o,e){this.browserEngine.launchArguments.push("--no-sandbox","--disable-setuid-sandbox")}},t="./jobsdb_scrape_logs";var i=parseInt(process.argv[2]),l="true"===process.env.LOG_ENABLED,c=function(o,e){if(!e)return n({level:"silent"});r.mkdirSync(t,{recursive:!0});const s=(new Date).toISOString().replace(/[:.]/g,"-"),i=`${t}/${o}-${s}.log`,l=n.destination({dest:i,sync:!1});return n({level:"info",name:o},l)}(`cloudnode-${i}`,l);!async function(){const e=new o({heroConfiguration:{disableSessionPersistence:!0}});e.heroCore.use(s);try{await e.listen(),c.info(`CloudNode ${i} listening on port ${await e.port}`),console.log(await e.port)}catch(o){console.error("ERROR starting Ulixee CloudNode",o),c.error("ERROR starting Ulixee CloudNode\n"+o),await e.close(),process.exit(1)}const r=async()=>{try{await e.close(),c.info(`CloudNode ${i} shut down gracefully.`),console.log("CloudNode shut down gracefully..."),process.exit(0)}catch(o){console.error(`Error during CloudNode shutdown + ${o}`),c.error("Error during CloudNode shutdown\n"+o),process.exit(1)}};process.on("SIGTERM",r),process.on("SIGINT",r)}();
@@ -0,0 +1,12 @@
1
+ import * as fs from 'fs';
2
+ import async from 'async';
3
+
4
+ declare function closeStream(stream: fs.WriteStream): Promise<unknown>;
5
+ declare function getWriteStream(filePath: string): fs.WriteStream;
6
+ declare function drainQueue(queue: async.QueueObject<Object>): Promise<void>;
7
+ declare function getWriteQueue(writeStream: fs.WriteStream): async.QueueObject<Object>;
8
+ declare function convertNdjsonToCsv(ndjsonFilePath: string, topLevelFields: Set<string>): Promise<void>;
9
+ declare function clean_dir(dirname: string): void;
10
+ declare function createDir(folderPath: string): void;
11
+
12
+ export { clean_dir, closeStream, convertNdjsonToCsv, createDir, drainQueue, getWriteQueue, getWriteStream };
@@ -0,0 +1 @@
1
+ import*as r from"fs";import{pipeline as n}from"stream/promises";import e from"async";import{Transform as t}from"@json2csv/node";import{DateTime as o}from"luxon";import{flatten as i}from"@json2csv/transforms";function s(r){if(!r)return;const n=o.fromISO(r);return n.isValid?n.toFormat("yyyy-MM-dd HH:mm:ss"):void 0}async function c(r){return new Promise(n=>{r.once("finish",n),r.end()})}function a(n){return r.createWriteStream(n,{flags:"a",flush:!0,highWaterMark:67108864})}async function f(r){return new Promise(n=>{0===r.length()&&0===r.running()?n():r.drain(n)})}function m(r){return e.queue((n,e)=>{try{const t=JSON.stringify(n)+"\n";r.write(t)?e():r.once("drain",()=>{r.write(t)?e():e(new Error("Write failed after drain"))})}catch(r){e(r)}},1)}async function u(e,o){const s=e.replace(".ndjson",".csv"),c=r.createReadStream(e,{encoding:"utf-8"}),a=r.createWriteStream(s,{encoding:"utf-8"}),f={ndjson:!0,defaultValue:null,transforms:[i({objects:!0,arrays:!0,separator:"_"}),d],fields:[...o]},m=new t(f,{},{});await n(c,m,a)}function d(r){for(const n in r){const e=r[n];if("string"==typeof e&&e.match(/^\d{4}-\d{2}-\d{2}/))try{r[n]=s(e)}catch(t){console.warn(`Failed to parse date for key "${n}": ${e}, error: ${t}`),r[n]=null}}return r}function y(n){if(r.existsSync(n))try{r.rmSync(n,{recursive:!0,force:!0})}catch(r){console.error("Error while removing logs directory:",r)}}function l(n){r.existsSync(n)||r.mkdirSync(n,{recursive:!0})}export{y as clean_dir,c as closeStream,u as convertNdjsonToCsv,l as createDir,f as drainQueue,m as getWriteQueue,a as getWriteStream};
@@ -0,0 +1,5 @@
1
+ import pino from 'pino';
2
+
3
+ declare function createLogger(name: string, enableLogging: boolean): pino.Logger;
4
+
5
+ export { createLogger };
@@ -0,0 +1 @@
1
+ import e from"fs";import r from"pino";var n="./jobsdb_scrape_logs";function o(o,t){if(!t)return r({level:"silent"});e.mkdirSync(n,{recursive:!0});const i=(new Date).toISOString().replace(/[:.]/g,"-"),s=`${n}/${o}-${i}.log`,l=r.destination({dest:s,sync:!1});return r({level:"info",name:o},l)}export{o as createLogger};
@@ -0,0 +1,10 @@
1
+ import Hero from '@ulixee/hero';
2
+
3
+ declare function parseArguments(options: any): Promise<void>;
4
+ declare function parseFilename(filename: string, format: string): void;
5
+ declare function parseSearchUrl(url: string): Promise<URL>;
6
+ declare function parseSaveDir(dirPath: string): string;
7
+ declare function parseFormat(fmt: string): void;
8
+ declare function parseNumPages(numPages: string, searchResultsUrl: URL, heroes?: Hero[]): Promise<number[]>;
9
+
10
+ export { parseArguments, parseFilename, parseFormat, parseNumPages, parseSaveDir, parseSearchUrl };
@@ -0,0 +1 @@
1
+ import{InvalidArgumentError as e}from"commander";import*as t from"fs";import{compile as o}from"html-to-text";import r from"@ulixee/hero-core";import{TransportBridge as n}from"@ulixee/net";import s,{ConnectionToHeroCore as a}from"@ulixee/hero";import{CorePlugin as i}from"@ulixee/hero-plugin-utils";import"stream/promises";import"async";import"@json2csv/node";import"luxon";import"@json2csv/transforms";import c from"path";import l from"valid-filename";var u=class extends i{static id="NoSandboxPlugin";onNewBrowser(e,t){this.browserEngine.launchArguments.push("--no-sandbox","--disable-setuid-sandbox")}};async function h(e,t){await e.goto(t),await e.waitForLoad("DomContentLoaded");const o=e.querySelector('section[data-automation="search-zero-results"], article[data-job-id]');try{await o.$waitForVisible({timeoutMs:1e4})}catch(e){throw new Error(`Fatal Error: Unable to determine whether ${t} is has search results`)}return!o||"SECTION"===await o.tagName}async function m(e,t){let o;((...e)=>{process.env.IS_LIBRARY_CALL||console.log(...e)})("Finding number of pages available to scrape on %s",e.href);let i=!1;try{if(void 0===t){i=!0;const e=new n,c=new a(e.transportToCore);o=new r({disableSessionPersistence:!0}),o.use(u),o.addConnection(e.transportToClient),t=new s({noChromeSandbox:!0,blockedResourceTypes:["All"],showChrome:!1,connectionToCore:c})}let c=1,l=1e3,m=new URL(e.href);for(;c<=l;){let e=Math.trunc((c+l)/2);m.searchParams.set("page",e.toString()),await h(t,m.href)?l=e-1:c=e+1}return c-1}catch(e){throw e}finally{i&&(await t.close(),await o.close())}}async function w(e){e.searchResultsUrl=await f(e.searchResultsUrlString),e.saveDir=p(e.saveDir),e.format=g(e.format);const[t,o]=await b(e.numPages,e.searchResultsUrl);e.numPages=t,e.maxPages=o}function d(t,o){if(!l(`${t}.${o}`))throw new e(`Provided filename "${t}" is invalid`)}async function f(t){const o=["hk.jobsdb.com","th.jobsdb.com"];if(!t||"string"!=typeof t||0===t.length)throw new e("URL must be a non-empty string");let i,c=null,l=null;try{if(o.includes(t)&&(t+="/jobs"),t.startsWith("https://")||(t="https://"+t),i=new URL(t),!o.includes(i.hostname))throw new e(`Invalid search url, urls must have either of the following hostnames [${o}] and point to a valid search results page`);const m=new n;l=new r({disableSessionPersistence:!0}),l.addConnection(m.transportToClient),l.use(u),c=new s({noChromeSandbox:!0,blockedResourceTypes:["All"],connectionToCore:new a(m.transportToCore)});if(await h(c,i.href))throw new e(`Invalid search url, urls must have either of the following hostnames [${o}] and point to a valid search results page`);return i}catch(e){throw e}finally{c&&await c.close(),l&&await l.close()}}function p(o){if("string"!=typeof o||""===o.trim())throw new e("Save directory path must be a non-empty string");const r=o.trim(),n=c.join(process.cwd(),r);if(t.existsSync(n)){if(!t.statSync(n).isDirectory())throw new e(`Save directory path exists but is not a directory: ${r}`);try{t.accessSync(n,t.constants.W_OK)}catch{throw new e(`Provided directory exists but is not writable: ${r}`)}}else s=n,t.existsSync(s)||t.mkdirSync(s,{recursive:!0});var s;return r}function g(t){const o=["ndjson","csv"];if(!o.includes(t))throw new e(`File format must be one of the following: ${o}`)}async function b(t,o,r){const n=await m(o);if(-1==n)throw new Error("\nCouldn't find the pages available to scrape, please file an issue on github");if("all"==t)return[n,n];const s=parseInt(t);if(isNaN(s))throw new e("Provided numPages arg is not a number.");if(s<1)throw new 
e("Provided numPages arg must be >=1");if(n<s)throw new e(`Provided numPages args must be <= ${n}`);return[s,n]}o({});export{w as parseArguments,d as parseFilename,g as parseFormat,b as parseNumPages,p as parseSaveDir,f as parseSearchUrl};
@@ -0,0 +1,13 @@
1
+ import { ScrapeOptions, ScrapeStats } from './types.js';
2
+
3
+ declare function findMaxPages(searchResultsUrlString: string): Promise<number>;
4
+ declare function scrapeJobsdb(options: ScrapeOptions): Promise<{
5
+ resultPath: string;
6
+ scrape_stats: ScrapeStats;
7
+ } | undefined>;
8
+ declare function main(options: ScrapeOptions): Promise<{
9
+ resultPath: string;
10
+ scrape_stats: ScrapeStats;
11
+ } | undefined>;
12
+
13
+ export { findMaxPages, main, scrapeJobsdb };
@@ -0,0 +1 @@
1
+ import e,{ConnectionToHeroCore as t}from"@ulixee/hero";import{compile as o}from"html-to-text";import r from"@ulixee/hero-core";import{TransportBridge as s}from"@ulixee/net";import{CorePlugin as n}from"@ulixee/hero-plugin-utils";import{v4 as i}from"uuid";import{Sema as a}from"async-sema";import{flatten as l}from"flat";import{spawn as c}from"child_process";import*as u from"path";import d,{dirname as h}from"path";import{fileURLToPath as m}from"url";import{DateTime as p}from"luxon";import f from"queue-fifo";import*as g from"fs";import w,{unlinkSync as $}from"fs";import b from"pino";import{pipeline as y}from"stream/promises";import v from"async";import{Transform as S}from"@json2csv/node";import{flatten as j}from"@json2csv/transforms";import{InvalidArgumentError as C}from"commander";import"valid-filename";var _=class extends n{static id="NoSandboxPlugin";onNewBrowser(e,t){this.browserEngine.launchArguments.push("--no-sandbox","--disable-setuid-sandbox")}};function P(e,t,o=40){if(e===t)return;const r=Math.min(e/t,1),s=Math.round(r*o),n="█".repeat(s)+"-".repeat(o-s),i=(100*r).toFixed(2);process.stdout.isTTY?(process.stdout.clearLine(0),process.stdout.cursorTo(0),process.stdout.write(`Progress: [${n}] ${i}%`),e>=t&&process.stdout.write("\n")):console.log(`Progress: [${n}] ${i}%`)}function k(e){return new Promise(t=>setTimeout(t,e))}var I=(...e)=>{process.env.IS_LIBRARY_CALL||console.log(...e)};var L=o({});async function T(e,t){await e.goto(t),await e.waitForLoad("DomContentLoaded");const o=e.querySelector('section[data-automation="search-zero-results"], article[data-job-id]');try{await o.$waitForVisible({timeoutMs:1e4})}catch(e){throw new Error(`Fatal Error: Unable to determine whether ${t} is has search results`)}return!o||"SECTION"===await o.tagName}async function R(o,n){let i;I("Finding number of pages available to scrape on %s",o.href);let a=!1;try{if(void 0===n){a=!0;const o=new s,l=new t(o.transportToCore);i=new r({disableSessionPersistence:!0}),i.use(_),i.addConnection(o.transportToClient),n=new e({noChromeSandbox:!0,blockedResourceTypes:["All"],showChrome:!1,connectionToCore:l})}let l=1,c=1e3,u=new URL(o.href);for(;l<=c;){let e=Math.trunc((l+c)/2);u.searchParams.set("page",e.toString()),await T(n,u.href)?c=e-1:l=e+1}return l-1}catch(e){throw e}finally{a&&(await n.close(),await i.close())}}var x=class{cookie;constructor(e={}){this.cookie=e}},E=class{jobIds;url;cookie;constructor(e,t=new x,o=[]){this.url=e,this.jobIds=o,this.cookie=t}},A=class{id;searchResultsUrl;cloudNodePort;pageQueue;timeout;logger;scrape_stats;topLevelFields;outQueue;timeoutPromise;timeoutClear;constructor(e,t,o,r,s,n,i,a,l=3600){this.id=e,this.scrape_stats=a,this.searchResultsUrl=t,this.outQueue=r,this.logger=s,this.pageQueue=n,this.timeout=l,this.topLevelFields=i;const{promise:c,clear:u}=function(e,t){let o;return{promise:new Promise((r,s)=>{o=setTimeout(()=>s(t),1e3*e)}),clear:()=>clearTimeout(o)}}(l,"Timeout");this.timeoutPromise=c.then(()=>s.error(`Scrape op ${e} timed out.`)),this.timeoutClear=u,this.cloudNodePort=o}assemble_cookie(e){return Object.entries(e.cookie).map(([e,t])=>`${e}=${t}`).join("; ")}get_dict(e){const t={};for(let o=0;o<e.length;o++)t[e[o].name]=e[o].value;return t}async scrape_job_details(e,t,o,r,s,n){let a=0;let c=!1,u=s.cookie.cookie;const d={accept:"*/*","accept-language":"en-GB,en-US;q=0.9,en;q=0.8","content-type":"application/json",origin:"https://hk.jobsdb.com",Connection:"keep-alive",Cookie:this.assemble_cookie(s.cookie),priority:"u=1, 
i",referer:s.url.href,"sec-fetch-dest":"empty","sec-fetch-mode":"cors","sec-fetch-site":"same-origin","seek-request-brand":"jobsdb","seek-request-country":"HK","user-agent":o,"x-seek-ec-sessionid":u.JobseekerSessionId??"","x-seek-ec-visitorid":u.JobseekerVisitorId??"","x-seek-site":"chalice"},h={operationName:"jobDetails",variables:{jobId:r,jobDetailsViewedCorrelationId:i(),sessionId:u.JobseekerSessionId??"",zone:"asia-1",locale:"en-HK",languageCode:"en",countryCode:"HK",timezone:"America/New_York"},query:'query jobDetails($jobId: ID!, $jobDetailsViewedCorrelationId: String!, $sessionId: String!, $zone: Zone!, $locale: Locale!, $languageCode: LanguageCodeIso!, $countryCode: CountryCodeIso2!, $timezone: Timezone!) {\n jobDetails(\n id: $jobId\n tracking: {channel: "WEB", jobDetailsViewedCorrelationId: $jobDetailsViewedCorrelationId, sessionId: $sessionId}\n ) {\n job {\n sourceZone\n id\n title\n phoneNumber\n isExpired\n expiresAt {\n dateTimeUtc\n }\n isLinkOut\n contactMatches {\n type\n value\n }\n isVerified\n abstract\n content(platform: WEB)\n status\n listedAt {\n label(context: JOB_POSTED, length: SHORT, timezone: $timezone, locale: $locale)\n dateTimeUtc\n }\n salary {\n currencyLabel(zone: $zone)\n label\n }\n shareLink(platform: WEB, zone: $zone, locale: $locale)\n workTypes {\n label(locale: $locale)\n }\n advertiser {\n id\n name(locale: $locale)\n isVerified\n registrationDate {\n dateTimeUtc\n }\n }\n location {\n label(locale: $locale, type: LONG)\n }\n classifications {\n label(languageCode: $languageCode)\n }\n products {\n branding {\n id\n cover {\n url\n }\n thumbnailCover: cover(isThumbnail: true) {\n url\n }\n logo {\n url\n }\n }\n bullets\n questionnaire {\n questions\n }\n video {\n url\n position\n }\n }\n }\n companyProfile(zone: $zone) {\n id\n name\n companyNameSlug\n shouldDisplayReviews\n branding {\n logo\n }\n overview {\n description {\n paragraphs\n }\n industry\n size {\n description\n }\n website {\n url\n }\n }\n reviewsSummary {\n overallRating {\n numberOfReviews {\n value\n }\n value\n }\n }\n perksAndBenefits {\n title\n }\n }\n companySearchUrl(zone: $zone, languageCode: $languageCode)\n companyTags {\n key(languageCode: $languageCode)\n value\n }\n restrictedApplication(countryCode: $countryCode) {\n label(locale: $locale)\n }\n }\n}'};for(;!c&&a<3;)try{const o=await t.fetch("https://hk.jobsdb.com/graphql",{method:"POST",headers:d,body:JSON.stringify(h)});if(200!==await o.status)throw this.logger.error(`Hero ${this.id}.${e} encountered error status ${await o.status} on job fetch for p${s.url.searchParams.get("page")}`),new Error(`Hero ${this.id}.${e} encountered error status ${await o.status} on job fetch for p${s.url.searchParams.get("page")}`);const r=(await o.json()).data.jobDetails.job;Object.keys(l(r,{delimiter:"_"})).forEach(e=>{this.topLevelFields.has(e)||this.topLevelFields.add(e)}),r.content=L(r.content),this.outQueue.push(r,t=>{if(t)throw this.logger.error(`Hero ${this.id}.${e}'s queue push failed:, ${t.message}`),new Error(`Hero ${this.id}.${e}'s queue push failed:, ${t.message}`)}),c=!0,a++}catch(t){this.logger.error(`Hero ${this.id}.${e} failed to scrape job ${r} on page ${s.url.searchParams.get("page")} on attempt #${a}: ${t.toString()}`)}if(n.release(),3==a)throw this.logger.error(`Hero ${this.id}.${e} failed to scrape job ${r} on page ${s.url.searchParams.get("page")}`),new Error(`Hero ${this.id}.${e} failed to scrape job ${r} on page ${s.url.searchParams.get("page")}`)}async scrape_page_job_details(e,t,o,r,s){let n=[];for(let i 
of s.jobIds)await o.acquire(),n.push(this.scrape_job_details(e,t,r,i,s,o));await Promise.all(n)}async startWorker(t){const o=new e({noChromeSandbox:!0,blockedResourceTypes:["All"],connectionToCore:{host:`localhost:${this.cloudNodePort}`}});let r=0,s=0;const n=(await o.meta).userAgentString;this.logger.info(`Hero instance ${this.id}.${t} started`);const i=new a(8);try{for(;!this.pageQueue.isEmpty();){let e=[];const a=this.pageQueue.dequeue();this.searchResultsUrl.searchParams.set("page",String(a)),this.logger.info(`Hero ${this.id}.${t} dequeued page ${a}`),await o.goto(this.searchResultsUrl.href),await o.waitForLoad("DomContentLoaded");let l=o.querySelectorAll("article[data-job-id]"),c=await l.length;for(let t=0;t<c;t++){const o=l.item(t);await o.$waitForVisible({timeoutMs:1e4});const r=await o.getAttribute("data-job-id");e.push(r)}if(0===e.length)break;const u=new x(this.get_dict(await o.activeTab.cookieStorage.getItems())),d=new E(this.searchResultsUrl,u,e);await this.scrape_page_job_details(t,o,i,n,d),r++,s+=e.length,this.scrape_stats.totalPagesScraped++,this.scrape_stats.totalJobsScraped+=e.length,this.logger.info(`Hero ${this.id}.${t} successfully scraped page ${a} with ${e.length} jobs`)}this.logger.info(`Hero instance ${this.id}.${t} closed, scraped ${r} pages and ${s} jobs.`)}catch(e){throw this.logger.info(`Hero ${this.id}.${t} failed on ${await o.activeTab.url}`),e}finally{await o.close()}}async scrape_all_jobs(){const e=[];let t=Math.min(this.pageQueue.size(),10);for(let o=0;o<t;o++)e.push(this.startWorker(o));await Promise.all(e)}async __start__(){try{this.logger.info(`Starting scrape operation ${this.id}, using cloud node on port ${this.cloudNodePort}`);if("Timeout"===await Promise.race([this.scrape_all_jobs(),this.timeoutPromise]))throw new Error(`Scrape operation ${this.id} timed out after ${this.timeout} seconds.`)}catch(e){throw this.logger.error(`Scrape operation ${this.id} failed: ${e.message}`),e}finally{this.timeoutClear()}}},N="./jobsdb_scrape_logs";function D(e){return new Promise((t,o)=>{e.stdout?.once("data",e=>{try{const o=parseInt(e.toString());t(o)}catch(e){o(e)}})})}function q(e){if(!e)return;const t=p.fromISO(e);return t.isValid?t.toFormat("yyyy-MM-dd HH:mm:ss"):void 0}function z(e){for(const t in e){const o=e[t];if("string"==typeof o&&o.match(/^\d{4}-\d{2}-\d{2}/))try{e[t]=q(o)}catch(r){console.warn(`Failed to parse date for key "${t}": ${o}, error: ${r}`),e[t]=null}}return e}async function H(e){e.searchResultsUrl=await O(e.searchResultsUrlString),e.saveDir=function(e){if("string"!=typeof e||""===e.trim())throw new C("Save directory path must be a non-empty string");const t=e.trim(),o=d.join(process.cwd(),t);if(g.existsSync(o)){if(!g.statSync(o).isDirectory())throw new C(`Save directory path exists but is not a directory: ${t}`);try{g.accessSync(o,g.constants.W_OK)}catch{throw new C(`Provided directory exists but is not writable: ${t}`)}}else r=o,g.existsSync(r)||g.mkdirSync(r,{recursive:!0});var r;return t}(e.saveDir),e.format=function(e){const t=["ndjson","csv"];if(!t.includes(e))throw new C(`File format must be one of the following: ${t}`)}(e.format);const[t,o]=await async function(e,t){const o=await R(t);if(-1==o)throw new Error("\nCouldn't find the pages available to scrape, please file an issue on github");if("all"==e)return[o,o];const r=parseInt(e);if(isNaN(r))throw new C("Provided numPages arg is not a number.");if(r<1)throw new C("Provided numPages arg must be >=1");if(o<r)throw new C(`Provided numPages args must be <= 
${o}`);return[r,o]}(e.numPages,e.searchResultsUrl);e.numPages=t,e.maxPages=o}async function O(o){const n=["hk.jobsdb.com","th.jobsdb.com"];if(!o||"string"!=typeof o||0===o.length)throw new C("URL must be a non-empty string");let i,a=null,l=null;try{if(n.includes(o)&&(o+="/jobs"),o.startsWith("https://")||(o="https://"+o),i=new URL(o),!n.includes(i.hostname))throw new C(`Invalid search url, urls must have either of the following hostnames [${n}] and point to a valid search results page`);const c=new s;l=new r({disableSessionPersistence:!0}),l.addConnection(c.transportToClient),l.use(_),a=new e({noChromeSandbox:!0,blockedResourceTypes:["All"],connectionToCore:new t(c.transportToCore)});if(await T(a,i.href))throw new C(`Invalid search url, urls must have either of the following hostnames [${n}] and point to a valid search results page`);return i}catch(e){throw e}finally{a&&await a.close(),l&&await l.close()}}var F=h(m(import.meta.url)),U=h(F),M="true"===process.env.LOG_ENABLED;M?function(e){if(g.existsSync(e))try{g.rmSync(e,{recursive:!0,force:!0})}catch(e){console.error("Error while removing logs directory:",e)}}(u.join(U,"jobsdb_scrape_logs")):process.removeAllListeners("warning");var W,B,V={totalJobsScraped:0,totalPagesScraped:0},J=function(e,t){if(!t)return b({level:"silent"});w.mkdirSync(N,{recursive:!0});const o=(new Date).toISOString().replace(/[:.]/g,"-"),r=`${N}/${e}-${o}.log`,s=b.destination({dest:r,sync:!1});return b({level:"info",name:e},s)}("client",M),Q=new Set,Y=[],K=0,G=new f,Z=[],X=[],ee=[],te=Date.now()/1e3;async function oe(e){process.env.IS_LIBRARY_CALL="true";const t=await O(e);return await R(t)}async function re(e){return process.env.IS_LIBRARY_CALL="true",await H(e),await se(e)}async function se(e){const t=e.maxPages,o=e.searchResultsUrl,r=e.numPages;if(!e.resultFileName){const t=p.utc().toFormat("yyyy-MM-dd_HH:mm:ss");e.resultFileName=`jobsdb-${o.hostname.substring(0,2)}-${r}-${t}`}const s=`${e.resultFileName}.ndjson`;let n=u.join(e.saveDir,s);for(let e=1;e<=r;e++)G.enqueue(e);var i,a;i=n,W=g.createWriteStream(i,{flags:"a",flush:!0,highWaterMark:67108864}),a=W,B=v.queue((e,t)=>{try{const o=JSON.stringify(e)+"\n";a.write(o)?t():a.once("drain",()=>{a.write(o)?t():t(new Error("Write failed after drain"))})}catch(e){t(e)}},1),W.on("error",e=>{throw J.error(`WriteStream error: ${e.message}`),new Error(`WriteStream error: ${e.message}`)}),K=r>10?2:1;try{for(let e=0;e<K;e++){const t=c("node",[u.join(U,"src/cloudnode.js"),String(e)],{shell:!1});J.info(`Starting cloudnode ${e}...`),Y.push(t)}for(let e=0;e<K;e++)ee.push(await D(Y[e])),J.info(`Cloudnode ${e} started on port ${ee[e]}`);for(let e=0;e<K;e++)Z.push(new A(e,o,ee[e],B,J.child({module:`scrapeOp${e}`}),G,Q,V)),X.push(Z[e].__start__()),J.info(`Scrape operation ${e} initialized`);let s=!1;I(`Scraping ${r}/${t} available pages of jobs on ${o.href}.`),J.info(`Scraping ${r}/${t} available pages of jobs on ${o.href}.`);const i=Promise.all(X).finally(()=>{s=!0,G.isEmpty()||console.error("\nCouldn't complete scraping operation at this time, try again in ~1min, if still persists, please file an issue on github")});for(process.env.IS_LIBRARY_CALL&&await i;!s;)P(r-G.size(),r),await k(1e3);return I(),J.info("All scrape operations completed."),await async function(e){return new Promise(t=>{0===e.length()&&0===e.running()?t():e.drain(t)})}(B),J.info("Object queue drained."),await async function(e){return new Promise(t=>{e.once("finish",t),e.end()})}(W),J.info("Outstream closed."),"csv"===e.format&&(await async function(e,t){const 
o=e.replace(".ndjson",".csv"),r=g.createReadStream(e,{encoding:"utf-8"}),s=g.createWriteStream(o,{encoding:"utf-8"}),n={ndjson:!0,defaultValue:null,transforms:[j({objects:!0,arrays:!0,separator:"_"}),z],fields:[...t]},i=new S(n,{},{});await y(r,i,s)}(n,Q),$(n),n=n.replace(".ndjson",".csv")),I(`Scrape finished in ${Math.floor(Date.now()/1e3-te)} seconds`),I(`Result file saved to ./${n}`),J.info(`Result file saved to ./${n}`),J.info(`Scrape finished in ${Math.floor(Date.now()/1e3-te)} seconds`),{resultPath:n,scrape_stats:V}}catch(e){"EACCES"===e.code?(console.error("The specified result directory does not have write permissions."),J.error("The specified result directory does not have write permissions.")):(console.error("scrape_jobsdb.ts in main:",e),J.error(`Error during scraping: ${e.message}`))}finally{for(let e=0;e<K;e++)Y.length>0&&(J.info(`Shutting down CloudNode ${e} on port ${ee[e]}...`),!1===Y[e].kill()&&(console.error("Error during CloudNode shutdown"),J.error(`Error during CloudNode ${e} shutdown`)))}}export{oe as findMaxPages,se as main,re as scrapeJobsdb};
@@ -0,0 +1,51 @@
1
+ import { Logger } from 'pino';
2
+ import Hero from '@ulixee/hero';
3
+ import Queue from 'queue-fifo';
4
+ import { Sema } from 'async-sema';
5
+ import { QueueObject } from 'async';
6
+ import { ScrapeStats } from './types.js';
7
+
8
+ declare class Cookie {
9
+ cookie: {
10
+ [key: string]: string;
11
+ };
12
+ constructor(cookie?: {
13
+ [key: string]: string;
14
+ });
15
+ }
16
+ declare class PageArgs {
17
+ jobIds: string[];
18
+ url: URL;
19
+ cookie: Cookie;
20
+ constructor(url: URL, cookie?: Cookie, jobIds?: string[]);
21
+ }
22
+ declare class JobArgs {
23
+ pageArgs: PageArgs;
24
+ jobid: string;
25
+ constructor(pageArgs: PageArgs, jobid: string);
26
+ }
27
+ declare class ScrapeOperation {
28
+ id: number;
29
+ searchResultsUrl: URL;
30
+ cloudNodePort: number;
31
+ pageQueue: Queue<number>;
32
+ timeout: number;
33
+ logger: Logger;
34
+ scrape_stats: ScrapeStats;
35
+ topLevelFields: Set<string>;
36
+ outQueue: QueueObject<Object>;
37
+ timeoutPromise: Promise<unknown>;
38
+ timeoutClear: () => void;
39
+ constructor(id: number, searchResultsUrl: URL, cloudNodePort: number, outQueue: QueueObject<Object>, logger: Logger, pageQueue: Queue<number>, topLevelFields: Set<string>, scrape_stats: ScrapeStats, timeout?: number);
40
+ assemble_cookie(cookie: Cookie): string;
41
+ get_dict(cookies: any): {
42
+ [key: string]: string;
43
+ };
44
+ scrape_job_details(workerId: number, hero: Hero, userAgent: string, jobId: string, pageArgs: PageArgs, semaphore: Sema): Promise<any>;
45
+ scrape_page_job_details(workerId: number, hero: Hero, semaphore: Sema, userAgent: string, pageArgs: PageArgs): Promise<any>;
46
+ startWorker(workerId: number): Promise<void>;
47
+ scrape_all_jobs(): Promise<void>;
48
+ __start__(): Promise<void>;
49
+ }
50
+
51
+ export { Cookie, JobArgs, PageArgs, ScrapeOperation };
@@ -0,0 +1 @@
1
+ import e from"@ulixee/hero";import{compile as t}from"html-to-text";import"@ulixee/hero-core";import"@ulixee/net";import"@ulixee/hero-plugin-utils";import{v4 as o}from"uuid";import{Sema as s}from"async-sema";import{flatten as i}from"flat";var a=t({}),n=class{cookie;constructor(e={}){this.cookie=e}},r=class{jobIds;url;cookie;constructor(e,t=new n,o=[]){this.url=e,this.jobIds=o,this.cookie=t}},l=class{pageArgs;jobid;constructor(e,t){this.pageArgs=e,this.jobid=t}},c=class{id;searchResultsUrl;cloudNodePort;pageQueue;timeout;logger;scrape_stats;topLevelFields;outQueue;timeoutPromise;timeoutClear;constructor(e,t,o,s,i,a,n,r,l=3600){this.id=e,this.scrape_stats=r,this.searchResultsUrl=t,this.outQueue=s,this.logger=i,this.pageQueue=a,this.timeout=l,this.topLevelFields=n;const{promise:c,clear:u}=function(e,t){let o;return{promise:new Promise((s,i)=>{o=setTimeout(()=>i(t),1e3*e)}),clear:()=>clearTimeout(o)}}(l,"Timeout");this.timeoutPromise=c.then(()=>i.error(`Scrape op ${e} timed out.`)),this.timeoutClear=u,this.cloudNodePort=o}assemble_cookie(e){return Object.entries(e.cookie).map(([e,t])=>`${e}=${t}`).join("; ")}get_dict(e){const t={};for(let o=0;o<e.length;o++)t[e[o].name]=e[o].value;return t}async scrape_job_details(e,t,s,n,r,l){let c=0;let u=!1,d=r.cookie.cookie;const h={accept:"*/*","accept-language":"en-GB,en-US;q=0.9,en;q=0.8","content-type":"application/json",origin:"https://hk.jobsdb.com",Connection:"keep-alive",Cookie:this.assemble_cookie(r.cookie),priority:"u=1, i",referer:r.url.href,"sec-fetch-dest":"empty","sec-fetch-mode":"cors","sec-fetch-site":"same-origin","seek-request-brand":"jobsdb","seek-request-country":"HK","user-agent":s,"x-seek-ec-sessionid":d.JobseekerSessionId??"","x-seek-ec-visitorid":d.JobseekerVisitorId??"","x-seek-site":"chalice"},g={operationName:"jobDetails",variables:{jobId:n,jobDetailsViewedCorrelationId:o(),sessionId:d.JobseekerSessionId??"",zone:"asia-1",locale:"en-HK",languageCode:"en",countryCode:"HK",timezone:"America/New_York"},query:'query jobDetails($jobId: ID!, $jobDetailsViewedCorrelationId: String!, $sessionId: String!, $zone: Zone!, $locale: Locale!, $languageCode: LanguageCodeIso!, $countryCode: CountryCodeIso2!, $timezone: Timezone!) 
{\n jobDetails(\n id: $jobId\n tracking: {channel: "WEB", jobDetailsViewedCorrelationId: $jobDetailsViewedCorrelationId, sessionId: $sessionId}\n ) {\n job {\n sourceZone\n id\n title\n phoneNumber\n isExpired\n expiresAt {\n dateTimeUtc\n }\n isLinkOut\n contactMatches {\n type\n value\n }\n isVerified\n abstract\n content(platform: WEB)\n status\n listedAt {\n label(context: JOB_POSTED, length: SHORT, timezone: $timezone, locale: $locale)\n dateTimeUtc\n }\n salary {\n currencyLabel(zone: $zone)\n label\n }\n shareLink(platform: WEB, zone: $zone, locale: $locale)\n workTypes {\n label(locale: $locale)\n }\n advertiser {\n id\n name(locale: $locale)\n isVerified\n registrationDate {\n dateTimeUtc\n }\n }\n location {\n label(locale: $locale, type: LONG)\n }\n classifications {\n label(languageCode: $languageCode)\n }\n products {\n branding {\n id\n cover {\n url\n }\n thumbnailCover: cover(isThumbnail: true) {\n url\n }\n logo {\n url\n }\n }\n bullets\n questionnaire {\n questions\n }\n video {\n url\n position\n }\n }\n }\n companyProfile(zone: $zone) {\n id\n name\n companyNameSlug\n shouldDisplayReviews\n branding {\n logo\n }\n overview {\n description {\n paragraphs\n }\n industry\n size {\n description\n }\n website {\n url\n }\n }\n reviewsSummary {\n overallRating {\n numberOfReviews {\n value\n }\n value\n }\n }\n perksAndBenefits {\n title\n }\n }\n companySearchUrl(zone: $zone, languageCode: $languageCode)\n companyTags {\n key(languageCode: $languageCode)\n value\n }\n restrictedApplication(countryCode: $countryCode) {\n label(locale: $locale)\n }\n }\n}'};for(;!u&&c<3;)try{const o=await t.fetch("https://hk.jobsdb.com/graphql",{method:"POST",headers:h,body:JSON.stringify(g)});if(200!==await o.status)throw this.logger.error(`Hero ${this.id}.${e} encountered error status ${await o.status} on job fetch for p${r.url.searchParams.get("page")}`),new Error(`Hero ${this.id}.${e} encountered error status ${await o.status} on job fetch for p${r.url.searchParams.get("page")}`);const s=(await o.json()).data.jobDetails.job;Object.keys(i(s,{delimiter:"_"})).forEach(e=>{this.topLevelFields.has(e)||this.topLevelFields.add(e)}),s.content=a(s.content),this.outQueue.push(s,t=>{if(t)throw this.logger.error(`Hero ${this.id}.${e}'s queue push failed:, ${t.message}`),new Error(`Hero ${this.id}.${e}'s queue push failed:, ${t.message}`)}),u=!0,c++}catch(t){this.logger.error(`Hero ${this.id}.${e} failed to scrape job ${n} on page ${r.url.searchParams.get("page")} on attempt #${c}: ${t.toString()}`)}if(l.release(),3==c)throw this.logger.error(`Hero ${this.id}.${e} failed to scrape job ${n} on page ${r.url.searchParams.get("page")}`),new Error(`Hero ${this.id}.${e} failed to scrape job ${n} on page ${r.url.searchParams.get("page")}`)}async scrape_page_job_details(e,t,o,s,i){let a=[];for(let n of i.jobIds)await o.acquire(),a.push(this.scrape_job_details(e,t,s,n,i,o));await Promise.all(a)}async startWorker(t){const o=new e({noChromeSandbox:!0,blockedResourceTypes:["All"],connectionToCore:{host:`localhost:${this.cloudNodePort}`}});let i=0,a=0;const l=(await o.meta).userAgentString;this.logger.info(`Hero instance ${this.id}.${t} started`);const c=new s(8);try{for(;!this.pageQueue.isEmpty();){let e=[];const s=this.pageQueue.dequeue();this.searchResultsUrl.searchParams.set("page",String(s)),this.logger.info(`Hero ${this.id}.${t} dequeued page ${s}`),await o.goto(this.searchResultsUrl.href),await o.waitForLoad("DomContentLoaded");let u=o.querySelectorAll("article[data-job-id]"),d=await u.length;for(let 
t=0;t<d;t++){const o=u.item(t);await o.$waitForVisible({timeoutMs:1e4});const s=await o.getAttribute("data-job-id");e.push(s)}if(0===e.length)break;const h=new n(this.get_dict(await o.activeTab.cookieStorage.getItems())),g=new r(this.searchResultsUrl,h,e);await this.scrape_page_job_details(t,o,c,l,g),i++,a+=e.length,this.scrape_stats.totalPagesScraped++,this.scrape_stats.totalJobsScraped+=e.length,this.logger.info(`Hero ${this.id}.${t} successfully scraped page ${s} with ${e.length} jobs`)}this.logger.info(`Hero instance ${this.id}.${t} closed, scraped ${i} pages and ${a} jobs.`)}catch(e){throw this.logger.info(`Hero ${this.id}.${t} failed on ${await o.activeTab.url}`),e}finally{await o.close()}}async scrape_all_jobs(){const e=[];let t=Math.min(this.pageQueue.size(),10);for(let o=0;o<t;o++)e.push(this.startWorker(o));await Promise.all(e)}async __start__(){try{this.logger.info(`Starting scrape operation ${this.id}, using cloud node on port ${this.cloudNodePort}`);if("Timeout"===await Promise.race([this.scrape_all_jobs(),this.timeoutPromise]))throw new Error(`Scrape operation ${this.id} timed out after ${this.timeout} seconds.`)}catch(e){throw this.logger.error(`Scrape operation ${this.id} failed: ${e.message}`),e}finally{this.timeoutClear()}}};export{n as Cookie,l as JobArgs,r as PageArgs,c as ScrapeOperation};
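The bundled module above exports Cookie, JobArgs, PageArgs and ScrapeOperation; ScrapeOperation drives the per-page Hero workers, pulls page numbers from a FIFO queue and pushes each scraped job to an output queue. As a rough orientation only, here is a wiring sketch: the constructor argument order is inferred from the minified source, the import path is hypothetical, and the queue/logger choices simply mirror the package's dependencies (queue-fifo, async, pino).

```ts
// Sketch only, not the package's documented API. Argument order inferred from the
// bundled constructor: (id, searchResultsUrl, cloudNodePort, outQueue, logger,
// pageQueue, topLevelFields, scrape_stats, timeoutSeconds = 3600).
import pino from 'pino';
import Queue from 'queue-fifo';
import { queue } from 'async';
import { ScrapeOperation } from './scrape_operation'; // hypothetical path to the module above

const pageQueue = new Queue();                    // queue-fifo: pages still to be scraped
for (let p = 1; p <= 5; p++) pageQueue.enqueue(p);

const outQueue = queue(async (job: any) => {      // async.queue: receives each scraped job object
  // persist `job` here (e.g. append it to an ndjson file)
}, 1);

const stats = { totalJobsScraped: 0, totalPagesScraped: 0 };

const op = new ScrapeOperation(
  1,                                              // id
  new URL('https://hk.jobsdb.com/jobs?page=1'),   // searchResultsUrl (example URL)
  1818,                                           // cloudNodePort of a running Ulixee cloud node (example port)
  outQueue,
  pino(),                                         // logger with info()/error()
  pageQueue,
  new Set<string>(),                              // topLevelFields: flattened job field names seen so far
  stats,                                          // scrape_stats, mutated in place
  3600,                                           // timeout in seconds
);
await op.__start__();
```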
@@ -0,0 +1,8 @@
1
+ import * as html_to_text from 'html-to-text';
2
+ import Hero from '@ulixee/hero';
3
+
4
+ declare const parseHtml: html_to_text.compiledFunction;
5
+ declare function isZeroResults(hero: Hero, url: string): Promise<boolean>;
6
+ declare function findLastPage(searchResultsUrl: URL, hero?: Hero): Promise<number>;
7
+
8
+ export { findLastPage, isZeroResults, parseHtml };
@@ -0,0 +1 @@
1
+ import{compile as e}from"html-to-text";import o from"@ulixee/hero-core";import{TransportBridge as t}from"@ulixee/net";import r,{ConnectionToHeroCore as a}from"@ulixee/hero";import{CorePlugin as n}from"@ulixee/hero-plugin-utils";var i=class extends n{static id="NoSandboxPlugin";onNewBrowser(e,o){this.browserEngine.launchArguments.push("--no-sandbox","--disable-setuid-sandbox")}},s=e({});async function l(e,o){await e.goto(o),await e.waitForLoad("DomContentLoaded");const t=e.querySelector('section[data-automation="search-zero-results"], article[data-job-id]');try{await t.$waitForVisible({timeoutMs:1e4})}catch(e){throw new Error(`Fatal Error: Unable to determine whether ${o} has search results`)}return!t||"SECTION"===await t.tagName}async function c(e,n){let s;((...e)=>{process.env.IS_LIBRARY_CALL||console.log(...e)})("Finding number of pages available to scrape on %s",e.href);let c=!1;try{if(void 0===n){c=!0;const e=new t,l=new a(e.transportToCore);s=new o({disableSessionPersistence:!0}),s.use(i),s.addConnection(e.transportToClient),n=new r({noChromeSandbox:!0,blockedResourceTypes:["All"],showChrome:!1,connectionToCore:l})}let h=1,u=1e3,w=new URL(e.href);for(;h<=u;){let e=Math.trunc((h+u)/2);w.searchParams.set("page",e.toString()),await l(n,w.href)?u=e-1:h=e+1}return h-1}catch(e){throw e}finally{c&&(await n.close(),await s.close())}}export{c as findLastPage,l as isZeroResults,s as parseHtml};
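findLastPage binary-searches pages 1 through 1000 of a search URL, using isZeroResults to test whether a given page still renders job cards; when no Hero instance is passed, it starts and later closes its own in-process Hero core. A minimal usage sketch, with an assumed import path and an example search URL:

```ts
// Minimal sketch; the import path and search URL are placeholders.
import { findLastPage } from './find_last_page'; // hypothetical path to the module above

const searchUrl = new URL('https://hk.jobsdb.com/jobs?page=1'); // any valid JobsDB search results URL
const lastPage = await findLastPage(searchUrl);                 // spins up its own Hero core if none is given
console.log(`Pages available to scrape: ${lastPage}`);
```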
@@ -0,0 +1,3 @@
1
+ declare function waitForPort(process: any): Promise<number>;
2
+
3
+ export { waitForPort };
@@ -0,0 +1 @@
1
+ function t(t){return new Promise((n,o)=>{t.stdout?.once("data",t=>{try{const o=parseInt(t.toString());n(o)}catch(t){o(t)}})})}export{t as waitForPort};
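waitForPort simply parses the first chunk the given child process writes to stdout as an integer; note that parseInt never throws, so a non-numeric first chunk resolves to NaN rather than rejecting. A usage sketch with a hypothetical spawned script:

```ts
// Sketch only: the spawned script name is hypothetical; any child process that prints
// its listening port as its first stdout output works the same way.
import { spawn } from 'node:child_process';
import { waitForPort } from './wait_for_port'; // hypothetical path to the module above

const child = spawn('node', ['start_cloud_node.js'], { stdio: ['ignore', 'pipe', 'inherit'] });
const port = await waitForPort(child); // resolves with the integer parsed from the first stdout chunk
console.log(`Cloud node listening on port ${port}`);
```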
@@ -0,0 +1,16 @@
1
+ type ScrapeStats = {
2
+ totalJobsScraped: number;
3
+ totalPagesScraped: number;
4
+ };
5
+ declare class ScrapeOptions {
6
+ searchResultsUrlString: string;
7
+ searchResultsUrl?: URL;
8
+ resultFileName: string | undefined;
9
+ numPages: number | 'all';
10
+ saveDir: string;
11
+ format: 'ndjson' | 'csv';
12
+ maxPages?: number;
13
+ constructor(searchResultsUrlString: string, numPages?: number | 'all', saveDir?: string, format?: 'ndjson' | 'csv', resultFileName?: string);
14
+ }
15
+
16
+ export { ScrapeOptions, type ScrapeStats };
@@ -0,0 +1 @@
1
+ var s=class{searchResultsUrlString;searchResultsUrl;resultFileName;numPages;saveDir;format;maxPages;constructor(s,r="all",e="./jobsdb_scraper_results",t="ndjson",a){this.searchResultsUrlString=s,this.numPages=r,this.saveDir=e,this.format=t,this.resultFileName=a}};export{s as ScrapeOptions};
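The bundled constructor above defaults numPages to 'all', saveDir to './jobsdb_scraper_results' and format to 'ndjson', leaving resultFileName undefined. For orientation (import path assumed):

```ts
// Sketch; the import path is an assumption.
import { ScrapeOptions } from './scrape_options'; // hypothetical path to the module above

// Defaults: numPages = 'all', saveDir = './jobsdb_scraper_results', format = 'ndjson'
const defaults = new ScrapeOptions('https://hk.jobsdb.com/jobs?page=1');

// Fully specified: 10 pages, CSV output, custom directory and result file name
const custom = new ScrapeOptions(
  'https://hk.jobsdb.com/jobs?page=1',
  10,
  './results',
  'csv',
  'ict_jobs',
);
```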
@@ -0,0 +1,13 @@
1
+ declare function printProgressBar(completed: number, total: number, barLength?: number): void;
2
+ declare function sleep(ms: number): Promise<unknown>;
3
+ declare function reverseString(str: string): string;
4
+ declare function getObjectSize(obj: {
5
+ [key: string]: any;
6
+ }): number;
7
+ declare const print: (...args: any[]) => void;
8
+ declare function createTimeoutPromise(timeout: number, timeoutVal: any): {
9
+ promise: Promise<unknown>;
10
+ clear: () => void;
11
+ };
12
+
13
+ export { createTimeoutPromise, getObjectSize, print, printProgressBar, reverseString, sleep };
@@ -0,0 +1 @@
1
+ function e(e,t,o=40){if(e===t)return;const r=Math.min(e/t,1),s=Math.round(r*o),n="█".repeat(s)+"-".repeat(o-s),i=(100*r).toFixed(2);process.stdout.isTTY?(process.stdout.clearLine(0),process.stdout.cursorTo(0),process.stdout.write(`Progress: [${n}] ${i}%`),e>=t&&process.stdout.write("\n")):console.log(`Progress: [${n}] ${i}%`)}function t(e){return new Promise(t=>setTimeout(t,e))}function o(e){return e.split("").reverse().join("")}function r(e){return Object.keys(e).length}var s=(...e)=>{process.env.IS_LIBRARY_CALL||console.log(...e)};function n(e,t){let o;return{promise:new Promise((r,s)=>{o=setTimeout(()=>s(t),1e3*e)}),clear:()=>clearTimeout(o)}}export{n as createTimeoutPromise,r as getObjectSize,s as print,e as printProgressBar,o as reverseString,t as sleep};
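Two details of the helpers above are easy to miss: createTimeoutPromise takes its first argument in seconds (it multiplies by 1000 before calling setTimeout) and its promise only ever rejects with timeoutVal, so it is meant to be raced against real work and cleared afterwards; printProgressBar is a no-op when completed equals total. A small sketch (the import path is assumed):

```ts
// Sketch; the import path is an assumption.
import { createTimeoutPromise, printProgressBar, sleep } from './utils'; // hypothetical path

async function doWork() {
  for (let done = 1; done <= 100; done++) {
    await sleep(50);                 // milliseconds
    printProgressBar(done, 100);     // no-op once done === 100 (see the bundled helper above)
  }
}

const { promise: timeout, clear } = createTimeoutPromise(30, new Error('Timeout')); // 30-second budget
try {
  await Promise.race([doWork(), timeout]); // rejects with the timeout value if the work is too slow
} catch (err) {
  console.error('Timed out:', err);
} finally {
  clear();                                 // cancel the pending timer once the race settles
}
```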
package/package.json ADDED
@@ -0,0 +1,77 @@
1
+ {
2
+ "name": "jobsdb-scraper",
3
+ "version": "1.0.3",
4
+ "description": "A tool for automatically collecting job information from JobsDB",
5
+ "main": "dist/src/scrape_jobsdb.js",
6
+ "bin": {
7
+ "jobsdb-scraper": "dist/src/cli.js"
8
+ },
9
+ "type": "module",
10
+ "files": [
11
+ "dist",
12
+ "README.md",
13
+ "sample_output.ndjson",
14
+ "assets",
15
+ "LICENSE"
16
+ ],
17
+ "author": "Krish Galani (krishdgala@gmail.com)",
18
+ "license": "PROPRIETARY",
19
+ "bugs": {
20
+ "url": "https://github.com/krishgalani/jobsdb-scraper/issues"
21
+ },
22
+ "repository": {
23
+ "type": "git",
24
+ "url": "git+https://github.com/krishgalani/jobsdb-scraper.git"
25
+ },
26
+ "dependencies": {
27
+ "@json2csv/node": "^7.0.6",
28
+ "@json2csv/transforms": "^7.0.6",
29
+ "@ulixee/chrome-139-0": "^7258.155.11",
30
+ "@ulixee/cloud": "^2.0.0-alpha.34",
31
+ "@ulixee/hero": "^2.0.0-alpha.34",
32
+ "async": "^3.2.6",
33
+ "async-sema": "^3.1.1",
34
+ "commander": "^14.0.2",
35
+ "flat": "^6.0.1",
36
+ "html-to-text": "^9.0.5",
37
+ "luxon": "^3.7.2",
38
+ "pino": "^10.1.0",
39
+ "queue-fifo": "^0.2.6",
40
+ "tslib": "^2.8.1",
41
+ "uuid": "^13.0.0",
42
+ "valid-filename": "^4.0.0"
43
+ },
44
+ "keywords": [
45
+ "cli",
46
+ "scraper",
47
+ "jobsdb",
48
+ "hero",
49
+ "typescript"
50
+ ],
51
+ "devDependencies": {
52
+ "@types/async": "^3.2.25",
53
+ "@types/html-to-text": "^9.0.4",
54
+ "@types/luxon": "^3.7.1",
55
+ "@types/node": "^25.0.2",
56
+ "@types/uuid": "^11.0.0",
57
+ "pino-pretty": "^13.1.3",
58
+ "rimraf": "^6.1.2",
59
+ "terser": "^5.44.1",
60
+ "ts-node": "^10.9.2",
61
+ "tsup": "^8.5.1",
62
+ "typescript": "^5.9.3",
63
+ "vitest": "^4.0.15"
64
+ },
65
+ "engines": {
66
+ "node": ">=18.0.0 <=22",
67
+ "npm": ">=8.0.0"
68
+ },
69
+ "scripts": {
70
+ "prepare": "npm run build",
71
+ "build": "tsup-node",
72
+ "scraper": "ts-node src/cli",
73
+ "clear": "rimraf build jobsdb_scrape_logs jobsdb_scrape_results",
74
+ "test": "vitest run --reporter verbose",
75
+ "prettify-logs": "sh prettify-logs.sh jobsdb_scrape_logs"
76
+ }
77
+ }
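The manifest restricts engines.node to the Node 18-22 range called out in the README and exposes both a library entry point (dist/src/scrape_jobsdb.js) and a jobsdb-scraper CLI binary. Purely as an illustration of that engines constraint, not code shipped by the package, a runtime guard could look like:

```ts
// Illustration only, not part of the package. Mirrors the "engines" field above
// and the README's stated Node 18-22 requirement.
const major = Number(process.versions.node.split('.')[0]);
if (major < 18 || major > 22) {
  throw new Error(`jobsdb-scraper supports Node 18-22; detected Node ${process.versions.node}`);
}
```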