@ooneex/html 0.0.18 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +285 -0
- package/dist/index.d.ts +11 -11
- package/dist/index.js +2 -2
- package/dist/index.js.map +3 -3
- package/package.json +5 -5
package/README.md
CHANGED
|
@@ -1 +1,286 @@
|
|
|
1
1
|
# @ooneex/html
|
|
2
|
+
|
|
3
|
+
HTML parsing and DOM manipulation toolkit powered by Cheerio — extract, transform, and query HTML content with a jQuery-like API.
|
|
4
|
+
|
|
5
|
+

|
|
6
|
+

|
|
7
|
+

|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
- **Cheerio Powered** - Built on Cheerio for fast, reliable HTML parsing with a jQuery-like API
|
|
12
|
+
- **Load from String or URL** - Parse HTML from raw strings or fetch and parse directly from URLs
|
|
13
|
+
- **Content Extraction** - Retrieve plain text content or full HTML output from parsed documents
|
|
14
|
+
- **Image Extraction** - Extract all images with their src, alt, title, width, and height attributes
|
|
15
|
+
- **Link Extraction** - Extract all anchor links with href, text, title, target, and rel attributes
|
|
16
|
+
- **Heading Extraction** - Extract all headings (h1-h6) with level, text, and id information
|
|
17
|
+
- **Video Extraction** - Extract video elements with sources, poster, controls, and playback attributes
|
|
18
|
+
- **Task Extraction** - Extract checkbox task items with text and checked state
|
|
19
|
+
- **Type-Safe** - Full TypeScript support with proper type definitions for all extracted data
|
|
20
|
+
- **Method Chaining** - Fluent API with chainable load methods
|
|
21
|
+
|
|
22
|
+
## Installation
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
bun add @ooneex/html
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Usage
|
|
29
|
+
|
|
30
|
+
### Basic Usage
|
|
31
|
+
|
|
32
|
+
```typescript
|
|
33
|
+
import { Html } from '@ooneex/html';
|
|
34
|
+
|
|
35
|
+
// Parse HTML from a string
|
|
36
|
+
const html = new Html('<h1>Hello World</h1><p>Some content</p>');
|
|
37
|
+
|
|
38
|
+
console.log(html.getContent()); // "Hello WorldSome content"
|
|
39
|
+
console.log(html.getHtml()); // Full HTML output
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### Loading HTML
|
|
43
|
+
|
|
44
|
+
```typescript
|
|
45
|
+
import { Html } from '@ooneex/html';
|
|
46
|
+
|
|
47
|
+
const html = new Html();
|
|
48
|
+
|
|
49
|
+
// Load from string
|
|
50
|
+
html.load('<div><a href="/link">Click here</a></div>');
|
|
51
|
+
|
|
52
|
+
// Load from URL
|
|
53
|
+
await html.loadUrl('https://example.com');
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Extracting Images
|
|
57
|
+
|
|
58
|
+
```typescript
|
|
59
|
+
import { Html } from '@ooneex/html';
|
|
60
|
+
|
|
61
|
+
const html = new Html(`
|
|
62
|
+
<img src="photo.jpg" alt="A photo" width="800" height="600" />
|
|
63
|
+
<img src="logo.png" alt="Logo" />
|
|
64
|
+
`);
|
|
65
|
+
|
|
66
|
+
const images = html.getImages();
|
|
67
|
+
// [
|
|
68
|
+
// { src: 'photo.jpg', alt: 'A photo', title: null, width: '800', height: '600' },
|
|
69
|
+
// { src: 'logo.png', alt: 'Logo', title: null, width: null, height: null }
|
|
70
|
+
// ]
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Extracting Links
|
|
74
|
+
|
|
75
|
+
```typescript
|
|
76
|
+
import { Html } from '@ooneex/html';
|
|
77
|
+
|
|
78
|
+
const html = new Html(`
|
|
79
|
+
<a href="/about" title="About us">About</a>
|
|
80
|
+
<a href="https://example.com" target="_blank" rel="noopener">External</a>
|
|
81
|
+
`);
|
|
82
|
+
|
|
83
|
+
const links = html.getLinks();
|
|
84
|
+
// [
|
|
85
|
+
// { href: '/about', text: 'About', title: 'About us', target: null, rel: null },
|
|
86
|
+
// { href: 'https://example.com', text: 'External', title: null, target: '_blank', rel: 'noopener' }
|
|
87
|
+
// ]
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Extracting Headings
|
|
91
|
+
|
|
92
|
+
```typescript
|
|
93
|
+
import { Html } from '@ooneex/html';
|
|
94
|
+
|
|
95
|
+
const html = new Html(`
|
|
96
|
+
<h1 id="title">Main Title</h1>
|
|
97
|
+
<h2>Section One</h2>
|
|
98
|
+
<h3 id="sub">Subsection</h3>
|
|
99
|
+
`);
|
|
100
|
+
|
|
101
|
+
const headings = html.getHeadings();
|
|
102
|
+
// [
|
|
103
|
+
// { level: 1, text: 'Main Title', id: 'title' },
|
|
104
|
+
// { level: 2, text: 'Section One', id: null },
|
|
105
|
+
// { level: 3, text: 'Subsection', id: 'sub' }
|
|
106
|
+
// ]
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Extracting Videos
|
|
110
|
+
|
|
111
|
+
```typescript
|
|
112
|
+
import { Html } from '@ooneex/html';
|
|
113
|
+
|
|
114
|
+
const html = new Html(`
|
|
115
|
+
<video poster="thumb.jpg" controls>
|
|
116
|
+
<source src="video.mp4" type="video/mp4" />
|
|
117
|
+
<source src="video.webm" type="video/webm" />
|
|
118
|
+
</video>
|
|
119
|
+
`);
|
|
120
|
+
|
|
121
|
+
const videos = html.getVideos();
|
|
122
|
+
// [
|
|
123
|
+
// {
|
|
124
|
+
// src: null, poster: 'thumb.jpg', width: null, height: null,
|
|
125
|
+
// controls: true, autoplay: false, loop: false, muted: false,
|
|
126
|
+
// sources: [
|
|
127
|
+
// { src: 'video.mp4', type: 'video/mp4' },
|
|
128
|
+
// { src: 'video.webm', type: 'video/webm' }
|
|
129
|
+
// ]
|
|
130
|
+
// }
|
|
131
|
+
// ]
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### Extracting Tasks
|
|
135
|
+
|
|
136
|
+
```typescript
|
|
137
|
+
import { Html } from '@ooneex/html';
|
|
138
|
+
|
|
139
|
+
const html = new Html(`
|
|
140
|
+
<ul>
|
|
141
|
+
<li><input type="checkbox" checked /> Done task</li>
|
|
142
|
+
<li><input type="checkbox" /> Pending task</li>
|
|
143
|
+
</ul>
|
|
144
|
+
`);
|
|
145
|
+
|
|
146
|
+
const tasks = html.getTasks();
|
|
147
|
+
// [
|
|
148
|
+
// { text: 'Done task', checked: true },
|
|
149
|
+
// { text: 'Pending task', checked: false }
|
|
150
|
+
// ]
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## API Reference
|
|
154
|
+
|
|
155
|
+
### `Html` Class
|
|
156
|
+
|
|
157
|
+
The main class for parsing and analyzing HTML documents.
|
|
158
|
+
|
|
159
|
+
#### Constructor
|
|
160
|
+
|
|
161
|
+
**`new Html(html?: string)`**
|
|
162
|
+
|
|
163
|
+
Creates a new Html instance, optionally parsing the provided HTML string.
|
|
164
|
+
|
|
165
|
+
**Parameters:**
|
|
166
|
+
- `html` - Optional HTML string to parse
|
|
167
|
+
|
|
168
|
+
#### Methods
|
|
169
|
+
|
|
170
|
+
**`load(html: string): this`**
|
|
171
|
+
|
|
172
|
+
Load and parse HTML from a string. Returns the instance for chaining.
|
|
173
|
+
|
|
174
|
+
**`loadUrl(url: string | URL): Promise<this>`**
|
|
175
|
+
|
|
176
|
+
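Fetch and parse HTML from a URL. Returns a promise resolving to the instance; if the fetch fails, the promise rejects with an `HtmlException` whose message is `Failed to fetch URL: <url>`.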
|
|
177
|
+
|
|
178
|
+
**`getContent(): string`**
|
|
179
|
+
|
|
180
|
+
Get the plain text content of the parsed document.
|
|
181
|
+
|
|
182
|
+
**`getHtml(): string`**
|
|
183
|
+
|
|
184
|
+
Get the full HTML string of the parsed document.
|
|
185
|
+
|
|
186
|
+
**`getImages(): HtmlImageType[]`**
|
|
187
|
+
|
|
188
|
+
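Extract all `<img>` elements with their attributes. Images without a `src` attribute are skipped; missing or empty optional attributes are returned as `null`.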
|
|
189
|
+
|
|
190
|
+
**`getLinks(): HtmlLinkType[]`**
|
|
191
|
+
|
|
192
|
+
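Extract all `<a>` elements with their attributes. Anchors without an `href` attribute are skipped; missing or empty optional attributes (and empty link text) are returned as `null`.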
|
|
193
|
+
|
|
194
|
+
**`getHeadings(): HtmlHeadingType[]`**
|
|
195
|
+
|
|
196
|
+
Extract all heading elements (h1-h6) with level, text, and id.
|
|
197
|
+
|
|
198
|
+
**`getVideos(): HtmlVideoType[]`**
|
|
199
|
+
|
|
200
|
+
Extract all `<video>` elements with their sources and attributes.
|
|
201
|
+
|
|
202
|
+
**`getTasks(): HtmlTaskType[]`**
|
|
203
|
+
|
|
204
|
+
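Extract all checkbox task items (`<input type="checkbox">`) with text and checked state. `text` is the trimmed text of the checkbox's parent element, and `checked` is `true` when the `checked` attribute is present.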
|
|
205
|
+
|
|
206
|
+
### Type Definitions
|
|
207
|
+
|
|
208
|
+
#### `HtmlImageType`
|
|
209
|
+
```typescript
|
|
210
|
+
type HtmlImageType = {
|
|
211
|
+
src: string;
|
|
212
|
+
alt: string | null;
|
|
213
|
+
title: string | null;
|
|
214
|
+
width: string | null;
|
|
215
|
+
height: string | null;
|
|
216
|
+
};
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
#### `HtmlLinkType`
|
|
220
|
+
```typescript
|
|
221
|
+
type HtmlLinkType = {
|
|
222
|
+
href: string;
|
|
223
|
+
text: string | null;
|
|
224
|
+
title: string | null;
|
|
225
|
+
target: string | null;
|
|
226
|
+
rel: string | null;
|
|
227
|
+
};
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
#### `HtmlHeadingType`
|
|
231
|
+
```typescript
|
|
232
|
+
type HtmlHeadingType = {
|
|
233
|
+
level: number;
|
|
234
|
+
text: string;
|
|
235
|
+
id: string | null;
|
|
236
|
+
};
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
#### `HtmlVideoType`
|
|
240
|
+
```typescript
|
|
241
|
+
type HtmlVideoType = {
|
|
242
|
+
src: string | null;
|
|
243
|
+
poster: string | null;
|
|
244
|
+
width: string | null;
|
|
245
|
+
height: string | null;
|
|
246
|
+
controls: boolean;
|
|
247
|
+
autoplay: boolean;
|
|
248
|
+
loop: boolean;
|
|
249
|
+
muted: boolean;
|
|
250
|
+
sources: { src: string; type: string | null }[];
|
|
251
|
+
};
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
#### `HtmlTaskType`
|
|
255
|
+
```typescript
|
|
256
|
+
type HtmlTaskType = {
|
|
257
|
+
text: string;
|
|
258
|
+
checked: boolean;
|
|
259
|
+
};
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
## License
|
|
263
|
+
|
|
264
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
265
|
+
|
|
266
|
+
## Contributing
|
|
267
|
+
|
|
268
|
+
Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change.
|
|
269
|
+
|
|
270
|
+
### Development Setup
|
|
271
|
+
|
|
272
|
+
1. Clone the repository
|
|
273
|
+
2. Install dependencies: `bun install`
|
|
274
|
+
3. Run tests: `bun run test`
|
|
275
|
+
4. Build the project: `bun run build`
|
|
276
|
+
|
|
277
|
+
### Guidelines
|
|
278
|
+
|
|
279
|
+
- Write tests for new features
|
|
280
|
+
- Follow the existing code style
|
|
281
|
+
- Update documentation for API changes
|
|
282
|
+
- Ensure all tests pass before submitting a PR
|
|
283
|
+
|
|
284
|
+
---
|
|
285
|
+
|
|
286
|
+
Made with love by the Ooneex team
|
package/dist/index.d.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Heading information extracted from HTML
|
|
3
3
|
*/
|
|
4
|
-
|
|
4
|
+
type HtmlHeadingType = {
|
|
5
5
|
/**
|
|
6
6
|
* Heading level (1-6)
|
|
7
7
|
*/
|
|
@@ -14,11 +14,11 @@ interface HtmlHeadingType {
|
|
|
14
14
|
* Heading id attribute
|
|
15
15
|
*/
|
|
16
16
|
id: string | null;
|
|
17
|
-
}
|
|
17
|
+
};
|
|
18
18
|
/**
|
|
19
19
|
* Link information extracted from HTML
|
|
20
20
|
*/
|
|
21
|
-
|
|
21
|
+
type HtmlLinkType = {
|
|
22
22
|
/**
|
|
23
23
|
* Link href URL
|
|
24
24
|
*/
|
|
@@ -39,11 +39,11 @@ interface HtmlLinkType {
|
|
|
39
39
|
* Link rel attribute
|
|
40
40
|
*/
|
|
41
41
|
rel: string | null;
|
|
42
|
-
}
|
|
42
|
+
};
|
|
43
43
|
/**
|
|
44
44
|
* Image information extracted from HTML
|
|
45
45
|
*/
|
|
46
|
-
|
|
46
|
+
type HtmlImageType = {
|
|
47
47
|
/**
|
|
48
48
|
* Image source URL
|
|
49
49
|
*/
|
|
@@ -64,11 +64,11 @@ interface HtmlImageType {
|
|
|
64
64
|
* Image height attribute
|
|
65
65
|
*/
|
|
66
66
|
height: string | null;
|
|
67
|
-
}
|
|
67
|
+
};
|
|
68
68
|
/**
|
|
69
69
|
* Task information extracted from HTML (checkbox list items)
|
|
70
70
|
*/
|
|
71
|
-
|
|
71
|
+
type HtmlTaskType = {
|
|
72
72
|
/**
|
|
73
73
|
* Task text content
|
|
74
74
|
*/
|
|
@@ -77,11 +77,11 @@ interface HtmlTaskType {
|
|
|
77
77
|
* Whether the task is checked/completed
|
|
78
78
|
*/
|
|
79
79
|
checked: boolean;
|
|
80
|
-
}
|
|
80
|
+
};
|
|
81
81
|
/**
|
|
82
82
|
* Video information extracted from HTML
|
|
83
83
|
*/
|
|
84
|
-
|
|
84
|
+
type HtmlVideoType = {
|
|
85
85
|
/**
|
|
86
86
|
* Video source URL
|
|
87
87
|
*/
|
|
@@ -121,7 +121,7 @@ interface HtmlVideoType {
|
|
|
121
121
|
src: string;
|
|
122
122
|
type: string | null;
|
|
123
123
|
}[];
|
|
124
|
-
}
|
|
124
|
+
};
|
|
125
125
|
/**
|
|
126
126
|
* Interface for HTML class
|
|
127
127
|
*/
|
|
@@ -179,7 +179,7 @@ interface IHtml {
|
|
|
179
179
|
*/
|
|
180
180
|
declare class Html implements IHtml {
|
|
181
181
|
private $;
|
|
182
|
-
constructor();
|
|
182
|
+
constructor(html?: string);
|
|
183
183
|
/**
|
|
184
184
|
* Load HTML from a string
|
|
185
185
|
* @param html - HTML string to parse
|
package/dist/index.js
CHANGED
|
@@ -1,3 +1,3 @@
|
|
|
1
|
-
import * as cheerio from "cheerio";
import { Exception } from "@ooneex/exception";
import { HttpStatus } from "@ooneex/http-status";

/**
 * Error type thrown by {@link Html} when loading a document fails.
 *
 * The second constructor argument is forwarded to the base `Exception` under
 * the `data` key, together with an internal-server-error status code.
 */
class HtmlException extends Exception {
  /**
   * @param {string} message - Human-readable error message.
   * @param {Record<string, unknown>} [data={}] - Extra context attached to the exception.
   */
  constructor(message, data = {}) {
    super(message, { status: HttpStatus.Code.InternalServerError, data });
    this.name = "HtmlException";
  }
}

/**
 * Read an attribute from a Cheerio selection, mapping a missing or empty
 * value to `null`.
 */
const attrOrNull = (selection, name) => selection.attr(name) || null;

/** Report whether a (boolean) attribute is present on a Cheerio selection. */
const hasAttr = (selection, name) => selection.attr(name) !== undefined;

/**
 * HTML document parser and analyzer built on Cheerio.
 */
class Html {
  /** Cheerio API bound to the currently loaded document. */
  $;

  /** Create an instance holding an empty document. */
  constructor() {
    this.$ = cheerio.load("");
  }

  /**
   * Load HTML from a string, replacing the current document.
   *
   * @param {string} html - HTML string to parse.
   * @returns {this} This instance, for chaining.
   */
  load(html) {
    this.$ = cheerio.load(html);
    return this;
  }

  /**
   * Load HTML from a URL using Cheerio's `fromURL`.
   *
   * @param {string | URL} url - URL to fetch HTML from.
   * @returns {Promise<this>} Promise resolving to this instance.
   * @throws {HtmlException} When `fromURL` rejects.
   */
  async loadUrl(url) {
    const urlString = url instanceof URL ? url.toString() : url;

    try {
      this.$ = await cheerio.fromURL(urlString);
      return this;
    } catch (error) {
      // NOTE(review): HtmlException already wraps its second argument under
      // `data`, so this payload ends up nested one level deeper
      // (`data.data`) and the `status: 500` here is only carried as data.
      // Left unchanged because callers may rely on the shape; confirm intent.
      throw new HtmlException(`Failed to fetch URL: ${urlString}`, {
        status: 500,
        data: {
          url: urlString,
          error: error instanceof Error ? error.message : String(error),
        },
      });
    }
  }

  /**
   * Get the text content of the loaded document.
   *
   * @returns {string} Trimmed text content.
   */
  getContent() {
    return this.$.text().trim();
  }

  /**
   * Get the HTML string of the loaded document.
   *
   * @returns {string} Trimmed HTML, or an empty string when none is available.
   */
  getHtml() {
    // Apply the fallback before trimming: the original `html().trim() ?? ""`
    // could never reach its fallback and would throw on a nullish result.
    return (this.$.html() ?? "").trim();
  }

  /**
   * Extract all images from the loaded document.
   * `<img>` elements without a `src` attribute are skipped.
   *
   * @returns {{src: string, alt: string|null, title: string|null, width: string|null, height: string|null}[]}
   */
  getImages() {
    const $ = this.$;
    const images = [];

    $("img").each((_, element) => {
      const $img = $(element);
      const src = $img.attr("src");
      if (!src) {
        return;
      }

      images.push({
        src,
        alt: attrOrNull($img, "alt"),
        title: attrOrNull($img, "title"),
        width: attrOrNull($img, "width"),
        height: attrOrNull($img, "height"),
      });
    });

    return images;
  }

  /**
   * Extract all links from the loaded document.
   * `<a>` elements without an `href` attribute are skipped.
   *
   * @returns {{href: string, text: string|null, title: string|null, target: string|null, rel: string|null}[]}
   */
  getLinks() {
    const $ = this.$;
    const links = [];

    $("a").each((_, element) => {
      const $link = $(element);
      const href = $link.attr("href");
      if (!href) {
        return;
      }

      links.push({
        href,
        text: $link.text().trim() || null,
        title: attrOrNull($link, "title"),
        target: attrOrNull($link, "target"),
        rel: attrOrNull($link, "rel"),
      });
    });

    return links;
  }

  /**
   * Extract all headings (h1-h6) from the loaded document.
   *
   * @returns {{level: number, text: string, id: string|null}[]}
   */
  getHeadings() {
    const $ = this.$;
    const headings = [];

    $("h1, h2, h3, h4, h5, h6").each((_, element) => {
      const $heading = $(element);
      // The level is the digit following "h" in the tag name.
      const tagName = element.tagName.toLowerCase();
      const level = Number.parseInt(tagName.charAt(1), 10);

      headings.push({
        level,
        text: $heading.text().trim(),
        id: attrOrNull($heading, "id"),
      });
    });

    return headings;
  }

  /**
   * Extract all videos from the loaded document, including nested
   * `<source>` elements (those without a `src` are skipped).
   *
   * @returns {object[]} One entry per `<video>` element.
   */
  getVideos() {
    const $ = this.$;
    const videos = [];

    $("video").each((_, element) => {
      const $video = $(element);
      const sources = [];

      $video.find("source").each((_index, sourceElement) => {
        const $source = $(sourceElement);
        const src = $source.attr("src");
        if (src) {
          sources.push({ src, type: attrOrNull($source, "type") });
        }
      });

      videos.push({
        src: attrOrNull($video, "src"),
        poster: attrOrNull($video, "poster"),
        width: attrOrNull($video, "width"),
        height: attrOrNull($video, "height"),
        controls: hasAttr($video, "controls"),
        autoplay: hasAttr($video, "autoplay"),
        loop: hasAttr($video, "loop"),
        muted: hasAttr($video, "muted"),
        sources,
      });
    });

    return videos;
  }

  /**
   * Extract all tasks (checkbox inputs) from the loaded document.
   * Each task's text is the trimmed text of the checkbox's parent element.
   *
   * @returns {{text: string, checked: boolean}[]}
   */
  getTasks() {
    const $ = this.$;
    const tasks = [];

    $('input[type="checkbox"]').each((_, element) => {
      const $checkbox = $(element);
      const text = $checkbox.parent().text().trim();

      tasks.push({ text, checked: hasAttr($checkbox, "checked") });
    });

    return tasks;
  }
}

export { HtmlException, Html };
|
|
1
|
+
import * as cheerio from "cheerio";
import { Exception } from "@ooneex/exception";
import { HttpStatus } from "@ooneex/http-status";

/**
 * Error type thrown by {@link Html} when loading a document fails.
 *
 * The second constructor argument is forwarded to the base `Exception` under
 * the `data` key, together with an internal-server-error status code.
 */
class HtmlException extends Exception {
  /**
   * @param {string} message - Human-readable error message.
   * @param {Record<string, unknown>} [data={}] - Extra context attached to the exception.
   */
  constructor(message, data = {}) {
    super(message, { status: HttpStatus.Code.InternalServerError, data });
    this.name = "HtmlException";
  }
}

/**
 * Read an attribute from a Cheerio selection, mapping a missing or empty
 * value to `null`.
 */
const attrOrNull = (selection, name) => selection.attr(name) || null;

/** Report whether a (boolean) attribute is present on a Cheerio selection. */
const hasAttr = (selection, name) => selection.attr(name) !== undefined;

/**
 * HTML document parser and analyzer built on Cheerio.
 */
class Html {
  /** Cheerio API bound to the currently loaded document. */
  $;

  /**
   * @param {string} [html] - Optional HTML string to parse; an empty
   *   document is loaded when omitted (or `null`).
   */
  constructor(html) {
    this.$ = cheerio.load(html ?? "");
  }

  /**
   * Load HTML from a string, replacing the current document.
   *
   * @param {string} html - HTML string to parse.
   * @returns {this} This instance, for chaining.
   */
  load(html) {
    this.$ = cheerio.load(html);
    return this;
  }

  /**
   * Load HTML from a URL using Cheerio's `fromURL`.
   *
   * @param {string | URL} url - URL to fetch HTML from.
   * @returns {Promise<this>} Promise resolving to this instance.
   * @throws {HtmlException} When `fromURL` rejects.
   */
  async loadUrl(url) {
    const urlString = url instanceof URL ? url.toString() : url;

    try {
      this.$ = await cheerio.fromURL(urlString);
      return this;
    } catch (error) {
      // NOTE(review): HtmlException already wraps its second argument under
      // `data`, so this payload ends up nested one level deeper
      // (`data.data`) and the `status: 500` here is only carried as data.
      // Left unchanged because callers may rely on the shape; confirm intent.
      throw new HtmlException(`Failed to fetch URL: ${urlString}`, {
        status: 500,
        data: {
          url: urlString,
          error: error instanceof Error ? error.message : String(error),
        },
      });
    }
  }

  /**
   * Get the text content of the loaded document.
   *
   * @returns {string} Trimmed text content.
   */
  getContent() {
    return this.$.text().trim();
  }

  /**
   * Get the HTML string of the loaded document.
   *
   * @returns {string} Trimmed HTML, or an empty string when none is available.
   */
  getHtml() {
    // Apply the fallback before trimming: the original `html().trim() ?? ""`
    // could never reach its fallback and would throw on a nullish result.
    return (this.$.html() ?? "").trim();
  }

  /**
   * Extract all images from the loaded document.
   * `<img>` elements without a `src` attribute are skipped.
   *
   * @returns {{src: string, alt: string|null, title: string|null, width: string|null, height: string|null}[]}
   */
  getImages() {
    const $ = this.$;
    const images = [];

    $("img").each((_, element) => {
      const $img = $(element);
      const src = $img.attr("src");
      if (!src) {
        return;
      }

      images.push({
        src,
        alt: attrOrNull($img, "alt"),
        title: attrOrNull($img, "title"),
        width: attrOrNull($img, "width"),
        height: attrOrNull($img, "height"),
      });
    });

    return images;
  }

  /**
   * Extract all links from the loaded document.
   * `<a>` elements without an `href` attribute are skipped.
   *
   * @returns {{href: string, text: string|null, title: string|null, target: string|null, rel: string|null}[]}
   */
  getLinks() {
    const $ = this.$;
    const links = [];

    $("a").each((_, element) => {
      const $link = $(element);
      const href = $link.attr("href");
      if (!href) {
        return;
      }

      links.push({
        href,
        text: $link.text().trim() || null,
        title: attrOrNull($link, "title"),
        target: attrOrNull($link, "target"),
        rel: attrOrNull($link, "rel"),
      });
    });

    return links;
  }

  /**
   * Extract all headings (h1-h6) from the loaded document.
   *
   * @returns {{level: number, text: string, id: string|null}[]}
   */
  getHeadings() {
    const $ = this.$;
    const headings = [];

    $("h1, h2, h3, h4, h5, h6").each((_, element) => {
      const $heading = $(element);
      // The level is the digit following "h" in the tag name.
      const tagName = element.tagName.toLowerCase();
      const level = Number.parseInt(tagName.charAt(1), 10);

      headings.push({
        level,
        text: $heading.text().trim(),
        id: attrOrNull($heading, "id"),
      });
    });

    return headings;
  }

  /**
   * Extract all videos from the loaded document, including nested
   * `<source>` elements (those without a `src` are skipped).
   *
   * @returns {object[]} One entry per `<video>` element.
   */
  getVideos() {
    const $ = this.$;
    const videos = [];

    $("video").each((_, element) => {
      const $video = $(element);
      const sources = [];

      $video.find("source").each((_index, sourceElement) => {
        const $source = $(sourceElement);
        const src = $source.attr("src");
        if (src) {
          sources.push({ src, type: attrOrNull($source, "type") });
        }
      });

      videos.push({
        src: attrOrNull($video, "src"),
        poster: attrOrNull($video, "poster"),
        width: attrOrNull($video, "width"),
        height: attrOrNull($video, "height"),
        controls: hasAttr($video, "controls"),
        autoplay: hasAttr($video, "autoplay"),
        loop: hasAttr($video, "loop"),
        muted: hasAttr($video, "muted"),
        sources,
      });
    });

    return videos;
  }

  /**
   * Extract all tasks (checkbox inputs) from the loaded document.
   * Each task's text is the trimmed text of the checkbox's parent element.
   *
   * @returns {{text: string, checked: boolean}[]}
   */
  getTasks() {
    const $ = this.$;
    const tasks = [];

    $('input[type="checkbox"]').each((_, element) => {
      const $checkbox = $(element);
      const text = $checkbox.parent().text().trim();

      tasks.push({ text, checked: hasAttr($checkbox, "checked") });
    });

    return tasks;
  }
}

export { HtmlException, Html };
|
|
2
2
|
|
|
3
|
-
//# debugId=
|
|
3
|
+
//# debugId=7AFABA5DDA7D976364756E2164756E21
|
package/dist/index.js.map
CHANGED
|
@@ -2,10 +2,10 @@
|
|
|
2
2
|
"version": 3,
|
|
3
3
|
"sources": ["src/Html.ts", "src/HtmlException.ts"],
|
|
4
4
|
"sourcesContent": [
|
|
5
|
-
"import type { CheerioAPI } from \"cheerio\";\nimport * as cheerio from \"cheerio\";\nimport { HtmlException } from \"./HtmlException\";\nimport type { HtmlHeadingType, HtmlImageType, HtmlLinkType, HtmlTaskType, HtmlVideoType, IHtml } from \"./types\";\n\n/**\n * HTML document parser and analyzer using Cheerio\n */\nexport class Html implements IHtml {\n private $: CheerioAPI;\n\n constructor() {\n this.$ = cheerio.load(\"\");\n }\n\n /**\n * Load HTML from a string\n * @param html - HTML string to parse\n * @returns this instance for chaining\n */\n public load(html: string): this {\n this.$ = cheerio.load(html);\n return this;\n }\n\n /**\n * Load HTML from a URL using Cheerio's fromURL method\n * @param url - URL to fetch HTML from\n * @returns Promise resolving to this instance for chaining\n */\n public async loadUrl(url: string | URL): Promise<this> {\n const urlString = url instanceof URL ? url.toString() : url;\n\n try {\n this.$ = await cheerio.fromURL(urlString);\n return this;\n } catch (error) {\n throw new HtmlException(`Failed to fetch URL: ${urlString}`, {\n status: 500,\n data: {\n url: urlString,\n error: error instanceof Error ? error.message : String(error),\n },\n });\n }\n }\n\n /**\n * Get the text content of the HTML document\n * @returns Trimmed text content\n */\n public getContent(): string {\n return this.$.text().trim();\n }\n\n /**\n * Get the full HTML string of the document\n * @returns HTML string\n */\n public getHtml(): string {\n return this.$.html().trim() ?? 
\"\";\n }\n\n /**\n * Extract all images from the HTML document\n * @returns Array of image information\n */\n public getImages(): HtmlImageType[] {\n const $ = this.$;\n const images: HtmlImageType[] = [];\n\n $(\"img\").each((_, element) => {\n const $img = $(element);\n const src = $img.attr(\"src\");\n\n if (src) {\n images.push({\n src,\n alt: $img.attr(\"alt\") || null,\n title: $img.attr(\"title\") || null,\n width: $img.attr(\"width\") || null,\n height: $img.attr(\"height\") || null,\n });\n }\n });\n\n return images;\n }\n\n /**\n * Extract all links from the HTML document\n * @returns Array of link information\n */\n public getLinks(): HtmlLinkType[] {\n const $ = this.$;\n const links: HtmlLinkType[] = [];\n\n $(\"a\").each((_, element) => {\n const $link = $(element);\n const href = $link.attr(\"href\");\n\n if (href) {\n links.push({\n href,\n text: $link.text().trim() || null,\n title: $link.attr(\"title\") || null,\n target: $link.attr(\"target\") || null,\n rel: $link.attr(\"rel\") || null,\n });\n }\n });\n\n return links;\n }\n\n /**\n * Extract all headings from the HTML document\n * @returns Array of heading information\n */\n public getHeadings(): HtmlHeadingType[] {\n const $ = this.$;\n const headings: HtmlHeadingType[] = [];\n\n $(\"h1, h2, h3, h4, h5, h6\").each((_, element) => {\n const $heading = $(element);\n const tagName = element.tagName.toLowerCase();\n const level = Number.parseInt(tagName.charAt(1), 10);\n\n headings.push({\n level,\n text: $heading.text().trim(),\n id: $heading.attr(\"id\") || null,\n });\n });\n\n return headings;\n }\n\n /**\n * Extract all videos from the HTML document\n * @returns Array of video information\n */\n public getVideos(): HtmlVideoType[] {\n const $ = this.$;\n const videos: HtmlVideoType[] = [];\n\n $(\"video\").each((_, element) => {\n const $video = $(element);\n const sources: { src: string; type: string | null }[] = [];\n\n $video.find(\"source\").each((_, sourceElement) => {\n const $source 
= $(sourceElement);\n const src = $source.attr(\"src\");\n\n if (src) {\n sources.push({\n src,\n type: $source.attr(\"type\") || null,\n });\n }\n });\n\n videos.push({\n src: $video.attr(\"src\") || null,\n poster: $video.attr(\"poster\") || null,\n width: $video.attr(\"width\") || null,\n height: $video.attr(\"height\") || null,\n controls: $video.attr(\"controls\") !== undefined,\n autoplay: $video.attr(\"autoplay\") !== undefined,\n loop: $video.attr(\"loop\") !== undefined,\n muted: $video.attr(\"muted\") !== undefined,\n sources,\n });\n });\n\n return videos;\n }\n\n /**\n * Extract all tasks (checkbox list items) from the HTML document\n * @returns Array of task information\n */\n public getTasks(): HtmlTaskType[] {\n const $ = this.$;\n const tasks: HtmlTaskType[] = [];\n\n $('input[type=\"checkbox\"]').each((_, element) => {\n const $checkbox = $(element);\n const $parent = $checkbox.parent();\n const checked = $checkbox.attr(\"checked\") !== undefined;\n\n const text = $parent.text().trim();\n\n tasks.push({\n text,\n checked,\n });\n });\n\n return tasks;\n }\n}\n",
|
|
5
|
+
"import type { CheerioAPI } from \"cheerio\";\nimport * as cheerio from \"cheerio\";\nimport { HtmlException } from \"./HtmlException\";\nimport type { HtmlHeadingType, HtmlImageType, HtmlLinkType, HtmlTaskType, HtmlVideoType, IHtml } from \"./types\";\n\n/**\n * HTML document parser and analyzer using Cheerio\n */\nexport class Html implements IHtml {\n private $: CheerioAPI;\n\n constructor(html?: string) {\n this.$ = cheerio.load(html ?? \"\");\n }\n\n /**\n * Load HTML from a string\n * @param html - HTML string to parse\n * @returns this instance for chaining\n */\n public load(html: string): this {\n this.$ = cheerio.load(html);\n return this;\n }\n\n /**\n * Load HTML from a URL using Cheerio's fromURL method\n * @param url - URL to fetch HTML from\n * @returns Promise resolving to this instance for chaining\n */\n public async loadUrl(url: string | URL): Promise<this> {\n const urlString = url instanceof URL ? url.toString() : url;\n\n try {\n this.$ = await cheerio.fromURL(urlString);\n return this;\n } catch (error) {\n throw new HtmlException(`Failed to fetch URL: ${urlString}`, {\n status: 500,\n data: {\n url: urlString,\n error: error instanceof Error ? error.message : String(error),\n },\n });\n }\n }\n\n /**\n * Get the text content of the HTML document\n * @returns Trimmed text content\n */\n public getContent(): string {\n return this.$.text().trim();\n }\n\n /**\n * Get the full HTML string of the document\n * @returns HTML string\n */\n public getHtml(): string {\n return this.$.html().trim() ?? 
\"\";\n }\n\n /**\n * Extract all images from the HTML document\n * @returns Array of image information\n */\n public getImages(): HtmlImageType[] {\n const $ = this.$;\n const images: HtmlImageType[] = [];\n\n $(\"img\").each((_, element) => {\n const $img = $(element);\n const src = $img.attr(\"src\");\n\n if (src) {\n images.push({\n src,\n alt: $img.attr(\"alt\") || null,\n title: $img.attr(\"title\") || null,\n width: $img.attr(\"width\") || null,\n height: $img.attr(\"height\") || null,\n });\n }\n });\n\n return images;\n }\n\n /**\n * Extract all links from the HTML document\n * @returns Array of link information\n */\n public getLinks(): HtmlLinkType[] {\n const $ = this.$;\n const links: HtmlLinkType[] = [];\n\n $(\"a\").each((_, element) => {\n const $link = $(element);\n const href = $link.attr(\"href\");\n\n if (href) {\n links.push({\n href,\n text: $link.text().trim() || null,\n title: $link.attr(\"title\") || null,\n target: $link.attr(\"target\") || null,\n rel: $link.attr(\"rel\") || null,\n });\n }\n });\n\n return links;\n }\n\n /**\n * Extract all headings from the HTML document\n * @returns Array of heading information\n */\n public getHeadings(): HtmlHeadingType[] {\n const $ = this.$;\n const headings: HtmlHeadingType[] = [];\n\n $(\"h1, h2, h3, h4, h5, h6\").each((_, element) => {\n const $heading = $(element);\n const tagName = element.tagName.toLowerCase();\n const level = Number.parseInt(tagName.charAt(1), 10);\n\n headings.push({\n level,\n text: $heading.text().trim(),\n id: $heading.attr(\"id\") || null,\n });\n });\n\n return headings;\n }\n\n /**\n * Extract all videos from the HTML document\n * @returns Array of video information\n */\n public getVideos(): HtmlVideoType[] {\n const $ = this.$;\n const videos: HtmlVideoType[] = [];\n\n $(\"video\").each((_, element) => {\n const $video = $(element);\n const sources: { src: string; type: string | null }[] = [];\n\n $video.find(\"source\").each((_, sourceElement) => {\n const $source 
= $(sourceElement);\n const src = $source.attr(\"src\");\n\n if (src) {\n sources.push({\n src,\n type: $source.attr(\"type\") || null,\n });\n }\n });\n\n videos.push({\n src: $video.attr(\"src\") || null,\n poster: $video.attr(\"poster\") || null,\n width: $video.attr(\"width\") || null,\n height: $video.attr(\"height\") || null,\n controls: $video.attr(\"controls\") !== undefined,\n autoplay: $video.attr(\"autoplay\") !== undefined,\n loop: $video.attr(\"loop\") !== undefined,\n muted: $video.attr(\"muted\") !== undefined,\n sources,\n });\n });\n\n return videos;\n }\n\n /**\n * Extract all tasks (checkbox list items) from the HTML document\n * @returns Array of task information\n */\n public getTasks(): HtmlTaskType[] {\n const $ = this.$;\n const tasks: HtmlTaskType[] = [];\n\n $('input[type=\"checkbox\"]').each((_, element) => {\n const $checkbox = $(element);\n const $parent = $checkbox.parent();\n const checked = $checkbox.attr(\"checked\") !== undefined;\n\n const text = $parent.text().trim();\n\n tasks.push({\n text,\n checked,\n });\n });\n\n return tasks;\n }\n}\n",
|
|
6
6
|
"import { Exception } from \"@ooneex/exception\";\nimport { HttpStatus } from \"@ooneex/http-status\";\n\nexport class HtmlException extends Exception {\n constructor(message: string, data: Record<string, unknown> = {}) {\n super(message, {\n status: HttpStatus.Code.InternalServerError,\n data,\n });\n this.name = \"HtmlException\";\n }\n}\n"
|
|
7
7
|
],
|
|
8
|
-
"mappings": "AACA,0BCDA,oBAAS,0BACT,qBAAS,4BAEF,MAAM,UAAsB,CAAU,CAC3C,WAAW,CAAC,EAAiB,EAAgC,CAAC,EAAG,CAC/D,MAAM,EAAS,CACb,OAAQ,EAAW,KAAK,oBACxB,MACF,CAAC,EACD,KAAK,KAAO,gBAEhB,CDHO,MAAM,CAAsB,CACzB,EAER,WAAW,
|
|
9
|
-
"debugId": "
|
|
8
|
+
"mappings": "AACA,0BCDA,oBAAS,0BACT,qBAAS,4BAEF,MAAM,UAAsB,CAAU,CAC3C,WAAW,CAAC,EAAiB,EAAgC,CAAC,EAAG,CAC/D,MAAM,EAAS,CACb,OAAQ,EAAW,KAAK,oBACxB,MACF,CAAC,EACD,KAAK,KAAO,gBAEhB,CDHO,MAAM,CAAsB,CACzB,EAER,WAAW,CAAC,EAAe,CACzB,KAAK,EAAY,OAAK,GAAQ,EAAE,EAQ3B,IAAI,CAAC,EAAoB,CAE9B,OADA,KAAK,EAAY,OAAK,CAAI,EACnB,UAQI,QAAO,CAAC,EAAkC,CACrD,IAAM,EAAY,aAAe,IAAM,EAAI,SAAS,EAAI,EAExD,GAAI,CAEF,OADA,KAAK,EAAI,MAAc,UAAQ,CAAS,EACjC,KACP,MAAO,EAAO,CACd,MAAM,IAAI,EAAc,wBAAwB,IAAa,CAC3D,OAAQ,IACR,KAAM,CACJ,IAAK,EACL,MAAO,aAAiB,MAAQ,EAAM,QAAU,OAAO,CAAK,CAC9D,CACF,CAAC,GAQE,UAAU,EAAW,CAC1B,OAAO,KAAK,EAAE,KAAK,EAAE,KAAK,EAOrB,OAAO,EAAW,CACvB,OAAO,KAAK,EAAE,KAAK,EAAE,KAAK,GAAK,GAO1B,SAAS,EAAoB,CAClC,IAAM,EAAI,KAAK,EACT,EAA0B,CAAC,EAiBjC,OAfA,EAAE,KAAK,EAAE,KAAK,CAAC,EAAG,IAAY,CAC5B,IAAM,EAAO,EAAE,CAAO,EAChB,EAAM,EAAK,KAAK,KAAK,EAE3B,GAAI,EACF,EAAO,KAAK,CACV,MACA,IAAK,EAAK,KAAK,KAAK,GAAK,KACzB,MAAO,EAAK,KAAK,OAAO,GAAK,KAC7B,MAAO,EAAK,KAAK,OAAO,GAAK,KAC7B,OAAQ,EAAK,KAAK,QAAQ,GAAK,IACjC,CAAC,EAEJ,EAEM,EAOF,QAAQ,EAAmB,CAChC,IAAM,EAAI,KAAK,EACT,EAAwB,CAAC,EAiB/B,OAfA,EAAE,GAAG,EAAE,KAAK,CAAC,EAAG,IAAY,CAC1B,IAAM,EAAQ,EAAE,CAAO,EACjB,EAAO,EAAM,KAAK,MAAM,EAE9B,GAAI,EACF,EAAM,KAAK,CACT,OACA,KAAM,EAAM,KAAK,EAAE,KAAK,GAAK,KAC7B,MAAO,EAAM,KAAK,OAAO,GAAK,KAC9B,OAAQ,EAAM,KAAK,QAAQ,GAAK,KAChC,IAAK,EAAM,KAAK,KAAK,GAAK,IAC5B,CAAC,EAEJ,EAEM,EAOF,WAAW,EAAsB,CACtC,IAAM,EAAI,KAAK,EACT,EAA8B,CAAC,EAcrC,OAZA,EAAE,wBAAwB,EAAE,KAAK,CAAC,EAAG,IAAY,CAC/C,IAAM,EAAW,EAAE,CAAO,EACpB,EAAU,EAAQ,QAAQ,YAAY,EACtC,EAAQ,OAAO,SAAS,EAAQ,OAAO,CAAC,EAAG,EAAE,EAEnD,EAAS,KAAK,CACZ,QACA,KAAM,EAAS,KAAK,EAAE,KAAK,EAC3B,GAAI,EAAS,KAAK,IAAI,GAAK,IAC7B,CAAC,EACF,EAEM,EAOF,SAAS,EAAoB,CAClC,IAAM,EAAI,KAAK,EACT,EAA0B,CAAC,EA+BjC,OA7BA,EAAE,OAAO,EAAE,KAAK,CAAC,EAAG,IAAY,CAC9B,IAAM,EAAS,EAAE,CAAO,EAClB,EAAkD,CAAC,EAEzD,EAAO,KAAK,QAAQ,EAAE,KAAK,CAAC,EAAG,IAAkB,CAC/C,IAAM,EAAU,EAAE,CAAa,EACzB,EAAM,EAAQ,KAAK,KAAK,EAE9B,GAAI,EACF,EAAQ,KAAK,CACX,MACA,KAAM,EAAQ,KAAK,MAAM,GAAK,IAChC,CAAC,EAEJ,EAED,EAAO,KAAK,CACV,IAAK,EAAO,KAAK,KAAK,GAAK,KAC3B,O
AAQ,EAAO,KAAK,QAAQ,GAAK,KACjC,MAAO,EAAO,KAAK,OAAO,GAAK,KAC/B,OAAQ,EAAO,KAAK,QAAQ,GAAK,KACjC,SAAU,EAAO,KAAK,UAAU,IAAM,OACtC,SAAU,EAAO,KAAK,UAAU,IAAM,OACtC,KAAM,EAAO,KAAK,MAAM,IAAM,OAC9B,MAAO,EAAO,KAAK,OAAO,IAAM,OAChC,SACF,CAAC,EACF,EAEM,EAOF,QAAQ,EAAmB,CAChC,IAAM,EAAI,KAAK,EACT,EAAwB,CAAC,EAe/B,OAbA,EAAE,wBAAwB,EAAE,KAAK,CAAC,EAAG,IAAY,CAC/C,IAAM,EAAY,EAAE,CAAO,EACrB,EAAU,EAAU,OAAO,EAC3B,EAAU,EAAU,KAAK,SAAS,IAAM,OAExC,EAAO,EAAQ,KAAK,EAAE,KAAK,EAEjC,EAAM,KAAK,CACT,OACA,SACF,CAAC,EACF,EAEM,EAEX",
|
|
9
|
+
"debugId": "7AFABA5DDA7D976364756E2164756E21",
|
|
10
10
|
"names": []
|
|
11
11
|
}
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ooneex/html",
|
|
3
|
-
"description": "HTML parsing and DOM manipulation
|
|
4
|
-
"version": "
|
|
3
|
+
"description": "HTML parsing and DOM manipulation toolkit powered by Cheerio — extract, transform, and query HTML content with a jQuery-like API",
|
|
4
|
+
"version": "1.0.1",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"files": [
|
|
7
7
|
"dist",
|
|
@@ -25,11 +25,11 @@
|
|
|
25
25
|
"test": "bun test tests",
|
|
26
26
|
"build": "bunup",
|
|
27
27
|
"lint": "tsgo --noEmit && bunx biome lint",
|
|
28
|
-
"npm:publish": "bun publish --tolerate-republish --access public"
|
|
28
|
+
"npm:publish": "bun publish --tolerate-republish --force --production --access public"
|
|
29
29
|
},
|
|
30
30
|
"dependencies": {
|
|
31
|
-
"@ooneex/exception": "
|
|
32
|
-
"@ooneex/http-status": "
|
|
31
|
+
"@ooneex/exception": "1.0.1",
|
|
32
|
+
"@ooneex/http-status": "1.0.1",
|
|
33
33
|
"cheerio": "^1.1.2"
|
|
34
34
|
},
|
|
35
35
|
"keywords": [
|