@koalarx/scrapping 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. package/@types/browser-config.d.ts +8 -0
  2. package/@types/browser-config.js +2 -0
  3. package/@types/dom-options.d.ts +4 -0
  4. package/@types/dom-options.js +2 -0
  5. package/@types/get-datatable-options.d.ts +9 -0
  6. package/@types/get-datatable-options.js +2 -0
  7. package/Browser.d.ts +11 -0
  8. package/Browser.js +65 -0
  9. package/Dom.d.ts +21 -0
  10. package/Dom.js +103 -0
  11. package/Frame.d.ts +5 -0
  12. package/Frame.js +10 -0
  13. package/Page.d.ts +8 -0
  14. package/Page.js +43 -0
  15. package/constants/args.d.ts +2 -0
  16. package/constants/args.js +42 -0
  17. package/package.json +7 -3
  18. package/.github/workflows/npm-publish.yml +0 -32
  19. package/.prettierrc.json +0 -9
  20. package/.vscode/launch.json +0 -13
  21. package/.vscode/mcp.json +0 -10
  22. package/.vscode/settings.json +0 -151
  23. package/.vscode/tasks.json +0 -17
  24. package/LICENSE +0 -21
  25. package/bun.lock +0 -749
  26. package/bunfig.toml +0 -7
  27. package/eslint.config.mts +0 -83
  28. package/lib/core/@types/browser-config.ts +0 -8
  29. package/lib/core/@types/dom-options.ts +0 -4
  30. package/lib/core/@types/get-datatable-options.ts +0 -9
  31. package/lib/core/Browser.ts +0 -81
  32. package/lib/core/Dom.ts +0 -144
  33. package/lib/core/Frame.ts +0 -8
  34. package/lib/core/Page.ts +0 -56
  35. package/lib/core/constants/args.ts +0 -40
  36. package/lib/test/download-file.spec.ts +0 -18
  37. package/lib/test/frame-interaction.spec.ts +0 -22
  38. package/lib/test/get-datatable-with-paginator.spec.ts +0 -31
  39. package/lib/test/search-wikipidea.spec.ts +0 -18
  40. package/lib/test/setup.ts +0 -14
  41. package/lib/test/vars.ts +0 -7
  42. package/scripts/README.md +0 -209
  43. package/scripts/create-version-tag.ps1 +0 -83
  44. package/scripts/create-version-tag.sh +0 -91
  45. package/scripts/hooks/post-commit +0 -118
  46. package/scripts/hooks/post-commit-windows.bat +0 -80
  47. package/scripts/hooks/post-commit.bat +0 -15
  48. package/scripts/hooks/post-commit.ps1 +0 -101
  49. package/scripts/hooks/post-merge +0 -66
  50. package/scripts/hooks/pre-commit +0 -82
  51. package/scripts/hooks/pre-commit-windows.bat +0 -68
  52. package/scripts/hooks/pre-commit.bat +0 -15
  53. package/scripts/setup-hooks.bat +0 -107
  54. package/scripts/setup-hooks.ps1 +0 -69
  55. package/scripts/setup-hooks.sh +0 -44
  56. package/scripts/test-linux-support.sh +0 -171
  57. package/scripts/version-dialog.ps1 +0 -137
  58. package/scripts/version-dialog.sh +0 -139
  59. package/tsconfig.build.json +0 -31
  60. package/tsconfig.json +0 -40
package/bunfig.toml DELETED
@@ -1,7 +0,0 @@
1
- [test]
2
- root = "lib/test"
3
- timeoutMs = 60000
4
- preload = ["./lib/test/setup.ts"]
5
-
6
- [loader]
7
- ".ts" = "tsx"
package/eslint.config.mts DELETED
@@ -1,83 +0,0 @@
1
- import js from '@eslint/js'
2
- import vitest from '@vitest/eslint-plugin'
3
- import globals from 'globals'
4
- import tseslint from 'typescript-eslint'
5
- import prettierConfig from 'eslint-config-prettier'
6
- import prettierPlugin from 'eslint-plugin-prettier'
7
-
8
- export default [
9
- {
10
- ignores: [
11
- '**/*.json',
12
- 'node_modules',
13
- 'dist',
14
- 'coverage',
15
- 'prisma/generated',
16
- '**/*.yaml',
17
- ],
18
- },
19
- js.configs.recommended,
20
- ...tseslint.configs.recommended,
21
- prettierConfig,
22
- {
23
- files: ['src/**/*.ts'],
24
- languageOptions: {
25
- parser: tseslint.parser,
26
- parserOptions: {
27
- ecmaVersion: 'latest',
28
- sourceType: 'module',
29
- },
30
- globals: globals.node,
31
- },
32
- plugins: {
33
- '@typescript-eslint': tseslint.plugin,
34
- prettier: prettierPlugin,
35
- },
36
- rules: {
37
- 'prettier/prettier': [
38
- 'warn',
39
- {
40
- printWidth: 80,
41
- tabWidth: 2,
42
- singleQuote: true,
43
- trailingComma: 'all',
44
- arrowParens: 'always',
45
- semi: false,
46
- endOfLine: 'lf',
47
- },
48
- ],
49
- 'no-unused-vars': 'off',
50
- 'no-useless-constructor': 'off',
51
- 'no-new': 'off',
52
- 'no-use-before-define': 'off',
53
- '@typescript-eslint/no-unused-vars': 'warn',
54
- '@typescript-eslint/no-explicit-any': 'off',
55
- '@typescript-eslint/no-unsafe-function-type': 'off',
56
- '@typescript-eslint/ban-ts-comment': 'off',
57
- '@typescript-eslint/no-unsafe-assignment': 'off',
58
- '@typescript-eslint/no-unsafe-member-access': 'off',
59
- '@typescript-eslint/no-unsafe-call': 'off',
60
- '@typescript-eslint/no-require-imports': 'off',
61
- 'lines-between-class-members': [
62
- 'warn',
63
- 'always',
64
- { exceptAfterSingleLine: true },
65
- ],
66
- },
67
- },
68
- {
69
- files: ['src/**/*.spec.ts', 'src/test/setup-e2e.ts'],
70
- languageOptions: {
71
- globals: {
72
- ...vitest.environments.env.globals,
73
- },
74
- },
75
- plugins: {
76
- vitest,
77
- },
78
- rules: {
79
- ...vitest.configs.recommended.rules,
80
- 'vitest/no-conditional-expect': 'off',
81
- },
82
- },
83
- ]
package/lib/core/@types/browser-config.ts DELETED
@@ -1,8 +0,0 @@
1
- export interface BrowserConfig {
2
- headless?: boolean
3
- proxy?: string
4
- minimalist?: boolean
5
- slowMo?: number
6
- downloadFolderPath?: string
7
- screenshotFolderPath?: string
8
- }
package/lib/core/@types/dom-options.ts DELETED
@@ -1,4 +0,0 @@
1
- export interface DOMOptions {
2
- downloadFolderPath?: string
3
- screenshotFolderPath?: string
4
- }
package/lib/core/@types/get-datatable-options.ts DELETED
@@ -1,9 +0,0 @@
1
- export interface GetDatatableOptions {
2
- withPagination?: {
3
- nextButtonSelector: string
4
- }
5
- infiniteScroll?: {
6
- scrollContainerSelector: string
7
- }
8
- limit?: number
9
- }
package/lib/core/Browser.ts DELETED
@@ -1,81 +0,0 @@
1
- import puppeteer, { Browser as PuppeteerBrowser } from 'puppeteer'
2
- import type { BrowserConfig } from './@types/browser-config'
3
- import { BROWSER_ARGS } from './constants/args'
4
- import { Page } from './Page'
5
- import { existsSync, mkdirSync } from 'node:fs'
6
-
7
- export class Browser {
8
- private _browser?: PuppeteerBrowser
9
- private _page?: Page
10
-
11
- constructor(private config: BrowserConfig) {}
12
-
13
- get page() {
14
- if (!this._page) {
15
- throw new Error('DOM is not lauched. Certificate you call init method.')
16
- }
17
-
18
- return this._page
19
- }
20
-
21
- async init() {
22
- const { headless, proxy, minimalist, slowMo } = this.config
23
-
24
- const downloadPath = this.config.downloadFolderPath ?? './downloads'
25
-
26
- if (!existsSync(downloadPath)) {
27
- mkdirSync(downloadPath)
28
- }
29
-
30
- const args = BROWSER_ARGS.concat(
31
- proxy ? [`--proxy-server=${proxy}`] : [],
32
- ).concat(headless ? [] : ['--start-maximized'])
33
-
34
- const browser = await puppeteer.launch({
35
- args,
36
- headless,
37
- defaultViewport: null,
38
- slowMo,
39
- downloadBehavior: {
40
- policy: 'allow',
41
- downloadPath,
42
- },
43
- })
44
-
45
- const page = await browser.pages().then((pages) => pages[0]!)
46
-
47
- if (minimalist) {
48
- page.removeAllListeners('request')
49
-
50
- await page.setRequestInterception(true)
51
-
52
- page.on('request', (req) => {
53
- if (
54
- req.resourceType() === 'stylesheet' ||
55
- req.resourceType() === 'font' ||
56
- req.resourceType() === 'image'
57
- ) {
58
- req.abort()
59
- } else {
60
- req.continue()
61
- }
62
- })
63
- }
64
-
65
- this._browser = browser
66
- this._page = new Page(page)
67
-
68
- return this
69
- }
70
-
71
- async close() {
72
- if (!this._browser) {
73
- throw new Error(
74
- 'Browser is not lauched. Certificate you call init method.',
75
- )
76
- }
77
-
78
- await this._page.close()
79
- await this._browser.close()
80
- }
81
- }
package/lib/core/Dom.ts DELETED
@@ -1,144 +0,0 @@
1
- import { delay, KlDate, toCamelCase } from '@koalarx/utils'
2
- import htmlTableToJson from 'html-table-to-json'
3
- import { existsSync, mkdirSync, readdirSync, readFileSync } from 'node:fs'
4
- import path from 'node:path'
5
- import type { KeyInput, Page } from 'puppeteer'
6
- import { Frame as PuppeteerFrame } from 'puppeteer'
7
- import type { DOMOptions } from './@types/dom-options'
8
- import type { GetDatatableOptions } from './@types/get-datatable-options'
9
-
10
- export class DOM {
11
- constructor(
12
- private readonly _page: Page | PuppeteerFrame,
13
- private readonly _options?: DOMOptions,
14
- ) {}
15
-
16
- protected get page() {
17
- return this._page instanceof PuppeteerFrame ? this._page.page() : this._page
18
- }
19
-
20
- async close() {
21
- await this.page.close()
22
- }
23
-
24
- async screenshot() {
25
- const folderPath = this._options?.screenshotFolderPath ?? './screenshots'
26
-
27
- if (!existsSync(folderPath)) {
28
- mkdirSync(folderPath)
29
- }
30
-
31
- const screenshotPath = path.join(
32
- folderPath,
33
- `${new KlDate().format('ddMMyyyyHHmmss')}.jpg`,
34
- )
35
-
36
- await this.page.screenshot({ path: screenshotPath })
37
- }
38
-
39
- async pressKey(key: KeyInput, combine?: KeyInput) {
40
- if (combine) {
41
- await this.page.keyboard.down(combine)
42
- }
43
-
44
- await this.page.keyboard.press(key)
45
- }
46
-
47
- async goTo(url: string) {
48
- await this._page.goto(url, { waitUntil: 'networkidle2' })
49
- }
50
-
51
- async fill(selector: string, value: string) {
52
- await this._page.locator(selector).scroll()
53
- await this._page.focus(selector)
54
- await this._page.locator(selector).fill(value)
55
- }
56
-
57
- async click(selector: string) {
58
- await this._page.locator(selector).scroll()
59
- await this._page.click(selector)
60
- }
61
-
62
- async focus(selector: string) {
63
- await this._page.locator(selector).scroll()
64
- await this._page.focus(selector)
65
- }
66
-
67
- async content(selector: string) {
68
- await this._page.locator(selector).scroll()
69
- return this._page.$$eval(selector, (elements) => {
70
- return elements.map(
71
- (el) => (el as HTMLElement).innerText.trim() as string,
72
- )
73
- })
74
- }
75
-
76
- async getDatatable<T = any>(
77
- selector: string,
78
- options?: GetDatatableOptions,
79
- ): Promise<T[]> {
80
- const table = await this._page.$eval(selector, (table) => table.outerHTML)
81
- const tableData = htmlTableToJson.parse(table).results[0]
82
- const result: T[] = tableData.map((row) => {
83
- const rowData: Record<string, string> = {}
84
-
85
- Object.keys(row).forEach((key) => {
86
- const value = row[key]
87
- const numberValue = Number(value)
88
- rowData[toCamelCase(key)] = isNaN(numberValue) ? value : numberValue
89
- })
90
-
91
- return rowData as T
92
- })
93
-
94
- if (options?.withPagination) {
95
- const nextButtonSelector = options.withPagination.nextButtonSelector
96
- const nextButton = await this._page.$(nextButtonSelector)
97
-
98
- if (
99
- nextButton &&
100
- !(await nextButton.evaluate(
101
- (btn) =>
102
- btn.classList.contains('disabled') || btn.hasAttribute('disabled'),
103
- ))
104
- ) {
105
- await nextButton.click()
106
- const nextPageData = await this.getDatatable(selector, options)
107
- result.push(...nextPageData)
108
- }
109
- }
110
-
111
- return result
112
- }
113
-
114
- async waitNavigation() {
115
- await this._page
116
- .waitForNavigation({ waitUntil: 'domcontentloaded', timeout: 5000 })
117
- .catch(() => null)
118
- }
119
-
120
- async getDownloadedFiles() {
121
- const downloadPath = this._options?.downloadFolderPath ?? './downloads'
122
-
123
- if (existsSync(downloadPath)) {
124
- let contentDir = []
125
-
126
- do {
127
- await delay(1000)
128
- contentDir = readdirSync(downloadPath)
129
- } while (
130
- contentDir.filter((filepath) => filepath.indexOf('.crdownload') >= 0)
131
- .length > 0
132
- )
133
-
134
- const files: Buffer[] = []
135
-
136
- contentDir.forEach((filepath) =>
137
- files.push(readFileSync(`${downloadPath}/${filepath}`)),
138
- )
139
- return files
140
- }
141
-
142
- return []
143
- }
144
- }
package/lib/core/Frame.ts DELETED
@@ -1,8 +0,0 @@
1
- import { Frame as PuppeteerFrame } from 'puppeteer'
2
- import { DOM } from './Dom'
3
-
4
- export class Frame extends DOM {
5
- constructor(frame: PuppeteerFrame) {
6
- super(frame)
7
- }
8
- }
package/lib/core/Page.ts DELETED
@@ -1,56 +0,0 @@
1
- import type { Frame as PuppeteerFrame, Page as PuppeteerPage } from 'puppeteer'
2
- import { DOM } from './Dom'
3
- import { Frame } from './Frame'
4
-
5
- export class Page extends DOM {
6
- constructor(page: PuppeteerPage) {
7
- super(page)
8
- }
9
-
10
- async getFrameByURL(url: string) {
11
- const frames = this.page.frames()
12
- const frame = frames.find((frame) => frame.url() === url)
13
-
14
- if (!frame) {
15
- throw new Error(`Frame with URL "${url}" not found`)
16
- }
17
-
18
- return new Frame(frame)
19
- }
20
-
21
- async getFrameByName(name: string) {
22
- const frames = this.page.frames()
23
-
24
- let frame: PuppeteerFrame | undefined
25
-
26
- for (const f of frames) {
27
- const frameElement = await f.frameElement()
28
-
29
- if (!frameElement) {
30
- continue
31
- }
32
-
33
- const frameName = await frameElement.evaluate((el) =>
34
- el.getAttribute('name'),
35
- )
36
-
37
- if (frameName === name) {
38
- frame = f
39
- break
40
- }
41
-
42
- const frameId = await frameElement.evaluate((el) => el.getAttribute('id'))
43
-
44
- if (frameId === name) {
45
- frame = f
46
- break
47
- }
48
- }
49
-
50
- if (!frame) {
51
- throw new Error(`Frame with name "${name}" not found`)
52
- }
53
-
54
- return new Frame(frame)
55
- }
56
- }
package/lib/core/constants/args.ts DELETED
@@ -1,40 +0,0 @@
1
- import { KlArray } from '@koalarx/utils'
2
-
3
- export const BROWSER_ARGS = new KlArray([
4
- '--autoplay-policy=user-gesture-required',
5
- '--disable-background-networking',
6
- '--disable-background-timer-throttling',
7
- '--disable-backgrounding-occluded-windows',
8
- '--disable-breakpad',
9
- '--disable-client-side-phishing-detection',
10
- '--disable-component-update',
11
- '--disable-default-apps',
12
- '--disable-dev-shm-usage',
13
- '--disable-domain-reliability',
14
- '--disable-features=AudioServiceOutOfProcess',
15
- '--disable-hang-monitor',
16
- '--disable-ipc-flooding-protection',
17
- '--disable-notifications',
18
- '--disable-offer-store-unmasked-wallet-cards',
19
- '--disable-popup-blocking',
20
- '--disable-print-preview',
21
- '--disable-prompt-on-repost',
22
- '--disable-renderer-backgrounding',
23
- '--disable-setuid-sandbox',
24
- '--disable-speech-api',
25
- '--disable-sync',
26
- '--hide-scrollbars',
27
- '--ignore-gpu-blacklist',
28
- '--metrics-recording-only',
29
- '--mute-audio',
30
- '--no-default-browser-check',
31
- '--no-first-run',
32
- '--no-pings',
33
- '--no-sandbox',
34
- '--no-zygote',
35
- '--password-store=basic',
36
- '--use-gl=swiftshader',
37
- '--use-mock-keychain',
38
- '--disable-setuid-sandbox',
39
- '-wait-for-browser',
40
- ])
package/lib/test/download-file.spec.ts DELETED
@@ -1,18 +0,0 @@
1
- import { rmSync } from 'node:fs'
2
- import { TestVars } from './vars'
3
-
4
- test('Test RPA download file', async () => {
5
- const page = TestVars.page
6
-
7
- await page.goTo('https://proof.ovh.net/files/')
8
- await page.click(
9
- '#main > table > tbody > tr:nth-child(2) > td:nth-child(1) > a',
10
- )
11
-
12
- const files = await page.getDownloadedFiles()
13
-
14
- expect(files.length).toBeGreaterThan(0)
15
- expect(files[0]).toBeInstanceOf(Buffer)
16
-
17
- rmSync('downloads', { recursive: true, force: true })
18
- }, 15000)
package/lib/test/frame-interaction.spec.ts DELETED
@@ -1,22 +0,0 @@
1
- import { TestVars } from './vars'
2
-
3
- test('Frame Interaction', async () => {
4
- const page = TestVars.page
5
- const urlFrame = 'https://pt.wikipedia.org'
6
-
7
- await page.goTo(`https://iframetester.com/?url=${urlFrame}`)
8
-
9
- const frame = await page.getFrameByName('iframe-window')
10
-
11
- await frame.click('#p-search > a')
12
- await frame.fill(
13
- '#searchform > div > div > div.cdx-text-input.cdx-text-input--has-start-icon.cdx-text-input--status-default.cdx-search-input__text-input > input',
14
- 'Einstein',
15
- )
16
- await frame.pressKey('Enter')
17
-
18
- const content = await frame.content('#mw-content-text p')
19
-
20
- expect(content).toBeInstanceOf(Array)
21
- expect(content.length > 0).toBeTruthy()
22
- })
package/lib/test/get-datatable-with-paginator.spec.ts DELETED
@@ -1,31 +0,0 @@
1
- import { TestVars } from './vars'
2
-
3
- interface DatatableItem {
4
- name: string
5
- position: string
6
- office: string
7
- age: string
8
- startDate: string
9
- salary: string
10
- }
11
-
12
- test('Test RPA get datatable with paginator', async () => {
13
- const page = TestVars.page
14
-
15
- await page.goTo('https://datatables.net')
16
- const data = await page.getDatatable<DatatableItem>('#example', {
17
- withPagination: {
18
- nextButtonSelector:
19
- '#example_wrapper > div:nth-child(3) > div.dt-layout-cell.dt-layout-end > div > nav > button.dt-paging-button.next',
20
- },
21
- })
22
-
23
- expect(data).toBeInstanceOf(Array)
24
- expect(data.length).toBeGreaterThan(10)
25
- expect(data[0]).toHaveProperty('name')
26
- expect(data[0]).toHaveProperty('position')
27
- expect(data[0]).toHaveProperty('office')
28
- expect(data[0]).toHaveProperty('age')
29
- expect(data[0]).toHaveProperty('startDate')
30
- expect(data[0]).toHaveProperty('salary')
31
- })
package/lib/test/search-wikipidea.spec.ts DELETED
@@ -1,18 +0,0 @@
1
- import { TestVars } from './vars'
2
-
3
- test('Test RPA search on wikipedia', async () => {
4
- const page = TestVars.page
5
-
6
- await page.goTo('https://pt.wikipedia.org')
7
- await page.click('#searchInput')
8
- await page.fill(
9
- '#searchform > div > div > div.cdx-text-input.cdx-text-input--has-start-icon.cdx-text-input--status-default.cdx-search-input__text-input > input',
10
- 'Einstein',
11
- )
12
- await page.pressKey('Enter')
13
-
14
- const content = await page.content('#mw-content-text p')
15
-
16
- expect(content).toBeInstanceOf(Array)
17
- expect(content.length > 0).toBeTruthy()
18
- })
package/lib/test/setup.ts DELETED
@@ -1,14 +0,0 @@
1
- import { Browser } from '../core/Browser'
2
- import { TestVars } from './vars'
3
-
4
- beforeAll(async () => {
5
- TestVars.browser = await new Browser({
6
- minimalist: false,
7
- headless: false,
8
- }).init()
9
- TestVars.page = TestVars.browser.page
10
- })
11
-
12
- afterAll(async () => {
13
- await TestVars.browser.close()
14
- })
package/lib/test/vars.ts DELETED
@@ -1,7 +0,0 @@
1
- import type { Browser } from '../core/Browser'
2
- import type { Page } from '../core/Page'
3
-
4
- export class TestVars {
5
- static browser: Browser
6
- static page: Page
7
- }