@wargas/crawler 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +100 -6
- package/bun.lock +5 -2
- package/dist/index.js +3 -1
- package/index.d.ts +15 -0
- package/index.test.ts +1 -1
- package/index.ts +4 -4
- package/package.json +8 -3
package/README.md
CHANGED
|
@@ -1,15 +1,109 @@
|
|
|
1
|
-
#
|
|
1
|
+
# Crawler
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Classe utilitária para realizar requisições HTTP com:
|
|
4
|
+
|
|
5
|
+
* Persistência automática de cookies
|
|
6
|
+
* Parsing automático de HTML
|
|
7
|
+
* Manipulação do DOM usando `linkedom`
|
|
8
|
+
* Cliente HTTP baseado em `got`
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
# Instalação
|
|
4
13
|
|
|
5
14
|
```bash
|
|
6
|
-
bun
|
|
15
|
+
bun add @wargas/crawler
|
|
7
16
|
```
|
|
8
17
|
|
|
9
|
-
|
|
18
|
+
ou usando npm:
|
|
10
19
|
|
|
11
20
|
```bash
|
|
12
|
-
|
|
21
|
+
npm install @wargas/crawler
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
# Uso básico
|
|
27
|
+
|
|
28
|
+
```ts
|
|
29
|
+
import { Crawler } from "@wargas/crawler";
|
|
30
|
+
|
|
31
|
+
const crawler = Crawler.factory();
|
|
32
|
+
|
|
33
|
+
await crawler.client.get("https://example.com");
|
|
34
|
+
|
|
35
|
+
console.log(crawler.html);
|
|
36
|
+
|
|
37
|
+
console.log(
|
|
38
|
+
crawler.document.querySelector("title")?.textContent
|
|
39
|
+
);
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
# Cookies persistentes
|
|
45
|
+
|
|
46
|
+
Os cookies são armazenados automaticamente no arquivo:
|
|
47
|
+
|
|
48
|
+
```txt
|
|
49
|
+
cookies.json
|
|
13
50
|
```
|
|
14
51
|
|
|
15
|
-
|
|
52
|
+
Isso permite manter sessão entre execuções.
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
# Limpar cookies
|
|
57
|
+
|
|
58
|
+
```ts
|
|
59
|
+
await crawler.removeAllCookies();
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
# Acessando o DOM
|
|
65
|
+
|
|
66
|
+
Como o HTML é convertido automaticamente usando `linkedom`, é possível utilizar APIs similares ao navegador:
|
|
67
|
+
|
|
68
|
+
```ts
|
|
69
|
+
const links = crawler.document.querySelectorAll("a");
|
|
70
|
+
|
|
71
|
+
for (const link of links) {
|
|
72
|
+
console.log(link.getAttribute("href"));
|
|
73
|
+
}
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
# Configurações atuais
|
|
79
|
+
|
|
80
|
+
A instância do `got` é criada com:
|
|
81
|
+
|
|
82
|
+
```ts
|
|
83
|
+
followRedirect: false
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
Ou seja:
|
|
87
|
+
|
|
88
|
+
* redirects não são seguidos automaticamente
|
|
89
|
+
* cookies são persistidos
|
|
90
|
+
* HTML é parseado automaticamente após cada resposta
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
# Possíveis melhorias
|
|
95
|
+
|
|
96
|
+
* Suporte a proxy
|
|
97
|
+
* Retry automático
|
|
98
|
+
* Timeout configurável
|
|
99
|
+
* User-Agent customizado
|
|
100
|
+
* Suporte a certificados digitais
|
|
101
|
+
* Suporte a HTTP2
|
|
102
|
+
* Métodos helper (`get`, `post`, `login`, etc.)
|
|
103
|
+
* Cache de páginas
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
|
|
107
|
+
# Licença
|
|
108
|
+
|
|
109
|
+
MIT
|
package/bun.lock
CHANGED
|
@@ -12,9 +12,10 @@
|
|
|
12
12
|
},
|
|
13
13
|
"devDependencies": {
|
|
14
14
|
"@types/bun": "latest",
|
|
15
|
+
"tsc": "^2.0.4",
|
|
15
16
|
},
|
|
16
17
|
"peerDependencies": {
|
|
17
|
-
"typescript": "^
|
|
18
|
+
"typescript": "^6.0.3",
|
|
18
19
|
},
|
|
19
20
|
},
|
|
20
21
|
},
|
|
@@ -103,9 +104,11 @@
|
|
|
103
104
|
|
|
104
105
|
"tough-cookie-file-store": ["tough-cookie-file-store@3.3.0", "", { "dependencies": { "tough-cookie": "^6.0.0" } }, "sha512-FbO/cOi/jp4wweo8soVNG/ZjDsgpBZWqaxWwu7gRKvsjg/Qt44kStp87VLfJnin749DlTbZDYvV1wuSr5jly2g=="],
|
|
105
106
|
|
|
107
|
+
"tsc": ["tsc@2.0.4", "", { "bin": { "tsc": "bin/tsc" } }, "sha512-fzoSieZI5KKJVBYGvwbVZs/J5za84f2lSTLPYf6AGiIf43tZ3GNrI1QzTLcjtyDDP4aLxd46RTZq1nQxe7+k5Q=="],
|
|
108
|
+
|
|
106
109
|
"type-fest": ["type-fest@5.6.0", "", { "dependencies": { "tagged-tag": "^1.0.0" } }, "sha512-8ZiHFm91orbSAe2PSAiSVBVko18pbhbiB3U9GglSzF/zCGkR+rxpHx6sEMCUm4kxY4LjDIUGgCfUMtwfZfjfUA=="],
|
|
107
110
|
|
|
108
|
-
"typescript": ["typescript@
|
|
111
|
+
"typescript": ["typescript@6.0.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-y2TvuxSZPDyQakkFRPZHKFm+KKVqIisdg9/CZwm9ftvKXLP8NRWj38/ODjNbr43SsoXqNuAisEf1GdCxqWcdBw=="],
|
|
109
112
|
|
|
110
113
|
"uhyphen": ["uhyphen@0.2.0", "", {}, "sha512-qz3o9CHXmJJPGBdqzab7qAYuW8kQGKNEuoHFYrBwV6hWIMcpAmxDLXojcHfFr9US1Pe6zUswEIJIbLI610fuqA=="],
|
|
111
114
|
|
package/dist/index.js
CHANGED
|
@@ -23300,7 +23300,9 @@ class Crawler {
|
|
|
23300
23300
|
return res;
|
|
23301
23301
|
}
|
|
23302
23302
|
]
|
|
23303
|
-
}
|
|
23303
|
+
},
|
|
23304
|
+
followRedirect: false,
|
|
23305
|
+
cookieJar: instance.cookieJar
|
|
23304
23306
|
});
|
|
23305
23307
|
return instance;
|
|
23306
23308
|
}
|
package/index.d.ts
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
// index.d.ts
|
|
2
|
+
|
|
3
|
+
import type { Got } from "got";
|
|
4
|
+
import type { CookieJar } from "tough-cookie";
|
|
5
|
+
|
|
6
|
+
export declare class Crawler {
|
|
7
|
+
client: Got;
|
|
8
|
+
cookieJar: CookieJar;
|
|
9
|
+
html: string;
|
|
10
|
+
document: Document;
|
|
11
|
+
|
|
12
|
+
static factory(): Crawler;
|
|
13
|
+
|
|
14
|
+
removeAllCookies(): Promise<void>;
|
|
15
|
+
}
|
package/index.test.ts
CHANGED
package/index.ts
CHANGED
|
@@ -8,7 +8,7 @@ export class Crawler {
|
|
|
8
8
|
client!: Got
|
|
9
9
|
cookieJar!: CookieJar
|
|
10
10
|
html = ``
|
|
11
|
-
document = parseHTML(``).document
|
|
11
|
+
document: Document = parseHTML(``).document
|
|
12
12
|
|
|
13
13
|
static factory() {
|
|
14
14
|
const instance = new Crawler()
|
|
@@ -27,9 +27,9 @@ export class Crawler {
|
|
|
27
27
|
return res
|
|
28
28
|
}
|
|
29
29
|
]
|
|
30
|
-
}
|
|
31
|
-
|
|
32
|
-
|
|
30
|
+
},
|
|
31
|
+
followRedirect:false,
|
|
32
|
+
cookieJar: instance.cookieJar,
|
|
33
33
|
|
|
34
34
|
})
|
|
35
35
|
|
package/package.json
CHANGED
|
@@ -1,19 +1,24 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@wargas/crawler",
|
|
3
|
-
"version": "0.0.3",
|
|
3
|
+
"version": "0.0.5",
|
|
4
4
|
"module": "index.ts",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"type": "module",
|
|
7
7
|
"private": false,
|
|
8
|
+
"repository": {
|
|
9
|
+
"type": "git",
|
|
10
|
+
"url": "https://github.com/wargas/crawler.git"
|
|
11
|
+
},
|
|
8
12
|
"scripts": {
|
|
9
13
|
"build": "bun build index.ts --target node --outdir dist",
|
|
10
14
|
"publish": "npm publish --tag latest --access public"
|
|
11
15
|
},
|
|
12
16
|
"devDependencies": {
|
|
13
|
-
"@types/bun": "latest"
|
|
17
|
+
"@types/bun": "latest",
|
|
18
|
+
"tsc": "^2.0.4"
|
|
14
19
|
},
|
|
15
20
|
"peerDependencies": {
|
|
16
|
-
"typescript": "^
|
|
21
|
+
"typescript": "^6.0.3"
|
|
17
22
|
},
|
|
18
23
|
"dependencies": {
|
|
19
24
|
"got": "^15.0.5",
|