magpie-html 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +35 -35
- package/dist/index.cjs +1787 -18
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +184 -1
- package/dist/index.d.ts +184 -1
- package/dist/index.js +1778 -19
- package/dist/index.js.map +1 -1
- package/package.json +8 -3
package/dist/index.d.cts
CHANGED
|
@@ -3069,4 +3069,187 @@ declare class PluckContentTypeError extends PluckError {
|
|
|
3069
3069
|
*/
|
|
3070
3070
|
declare function pluck(input: string | URL | Request, init?: PluckInit): Promise<PluckResponse>;
|
|
3071
3071
|
|
|
3072
|
-
|
|
3072
|
+
/**
|
|
3073
|
+
* Base error type for `swoop()`.
|
|
3074
|
+
*/
|
|
3075
|
+
declare class SwoopError extends Error {
|
|
3076
|
+
name: string;
|
|
3077
|
+
}
|
|
3078
|
+
/**
|
|
3079
|
+
* Thrown when the current runtime cannot execute `swoop()`.
|
|
3080
|
+
*/
|
|
3081
|
+
declare class SwoopEnvironmentError extends SwoopError {
|
|
3082
|
+
name: string;
|
|
3083
|
+
}
|
|
3084
|
+
/**
|
|
3085
|
+
* Thrown when `swoop()` exceeds its configured timeout.
|
|
3086
|
+
*/
|
|
3087
|
+
declare class SwoopTimeoutError extends SwoopError {
|
|
3088
|
+
name: string;
|
|
3089
|
+
}
|
|
3090
|
+
/**
|
|
3091
|
+
* Thrown when script execution fails in a non-recoverable way.
|
|
3092
|
+
*/
|
|
3093
|
+
declare class SwoopExecutionError extends SwoopError {
|
|
3094
|
+
name: string;
|
|
3095
|
+
}
|
|
3096
|
+
/**
|
|
3097
|
+
* Thrown when `swoop()` is asked to execute potentially unsafe scripts
|
|
3098
|
+
* in a context where the caller should explicitly acknowledge the risk.
|
|
3099
|
+
*/
|
|
3100
|
+
declare class SwoopSecurityError extends SwoopError {
|
|
3101
|
+
name: string;
|
|
3102
|
+
}
|
|
3103
|
+
|
|
3104
|
+
/**
|
|
3105
|
+
* How `swoop()` decides when a client-rendered page is "done enough" to snapshot.
|
|
3106
|
+
*
|
|
3107
|
+
* @remarks
|
|
3108
|
+
* This is DOM-only rendering (no layout/paint). "Done" is best-effort.
|
|
3109
|
+
*/
|
|
3110
|
+
type SwoopWaitStrategy = 'timeout' | 'networkidle';
|
|
3111
|
+
type SwoopEngine = 'vm';
|
|
3112
|
+
interface SwoopInit {
|
|
3113
|
+
/**
|
|
3114
|
+
* Execution engine used for running third-party scripts.
|
|
3115
|
+
*
|
|
3116
|
+
* @remarks
|
|
3117
|
+
* - `vm` (default): practical, supports `fetch` by reusing host globals
|
|
3118
|
+
*
|
|
3119
|
+
* @defaultValue 'vm'
|
|
3120
|
+
*/
|
|
3121
|
+
engine?: SwoopEngine;
|
|
3122
|
+
/**
|
|
3123
|
+
* Pluck options used for the initial HTML request and external script fetching.
|
|
3124
|
+
*/
|
|
3125
|
+
pluck?: PluckInit;
|
|
3126
|
+
/**
|
|
3127
|
+
* Execute inline and external scripts found in the HTML.
|
|
3128
|
+
*
|
|
3129
|
+
* @defaultValue true
|
|
3130
|
+
*/
|
|
3131
|
+
executeScripts?: boolean;
|
|
3132
|
+
/**
|
|
3133
|
+
* Maximum time to wait for the page to "settle" before taking a snapshot.
|
|
3134
|
+
*
|
|
3135
|
+
* @defaultValue 3000
|
|
3136
|
+
*/
|
|
3137
|
+
timeout?: number;
|
|
3138
|
+
/**
|
|
3139
|
+
* Which waiting strategy to use.
|
|
3140
|
+
*
|
|
3141
|
+
* - `timeout`: sleep for `timeout` and snapshot
|
|
3142
|
+
* - `networkidle`: wait until no tracked fetches are pending for `idleTime`
|
|
3143
|
+
*
|
|
3144
|
+
* @defaultValue 'networkidle'
|
|
3145
|
+
*/
|
|
3146
|
+
waitStrategy?: SwoopWaitStrategy;
|
|
3147
|
+
/**
|
|
3148
|
+
* Required quiet period (ms) for `networkidle`.
|
|
3149
|
+
*
|
|
3150
|
+
* @defaultValue 250
|
|
3151
|
+
*/
|
|
3152
|
+
idleTime?: number;
|
|
3153
|
+
/**
|
|
3154
|
+
* Poll interval (ms) for `networkidle`.
|
|
3155
|
+
*
|
|
3156
|
+
* @defaultValue 25
|
|
3157
|
+
*/
|
|
3158
|
+
pollInterval?: number;
|
|
3159
|
+
/**
|
|
3160
|
+
* How many scripts to load/execute at most.
|
|
3161
|
+
*
|
|
3162
|
+
* @defaultValue 64
|
|
3163
|
+
*/
|
|
3164
|
+
maxScripts?: number;
|
|
3165
|
+
/**
|
|
3166
|
+
* If true, forward console output from the isolated realm to the host console.
|
|
3167
|
+
*
|
|
3168
|
+
* @defaultValue false
|
|
3169
|
+
*/
|
|
3170
|
+
forwardConsole?: boolean;
|
|
3171
|
+
/**
|
|
3172
|
+
* If true, installs permissive Proxy-based stubs for common missing browser APIs.
|
|
3173
|
+
*
|
|
3174
|
+
* @remarks
|
|
3175
|
+
* This may hide some failures (by turning hard crashes into no-ops), but improves
|
|
3176
|
+
* compatibility for a best-effort snapshotter.
|
|
3177
|
+
*
|
|
3178
|
+
* @defaultValue true
|
|
3179
|
+
*/
|
|
3180
|
+
permissiveShims?: boolean;
|
|
3181
|
+
/**
|
|
3182
|
+
* Record all sandbox `fetch()` calls into the captured console output.
|
|
3183
|
+
*
|
|
3184
|
+
* @defaultValue false
|
|
3185
|
+
*/
|
|
3186
|
+
debugFetch?: boolean;
|
|
3187
|
+
/**
|
|
3188
|
+
* Enable additional sandbox probes to help iterative shim development.
|
|
3189
|
+
*
|
|
3190
|
+
* @remarks
|
|
3191
|
+
* Collects lightweight runtime stats (DOM ops/mutations, app-root growth, etc.)
|
|
3192
|
+
* and emits them via captured console.
|
|
3193
|
+
*
|
|
3194
|
+
* @defaultValue false
|
|
3195
|
+
*/
|
|
3196
|
+
debugProbes?: boolean;
|
|
3197
|
+
}
|
|
3198
|
+
interface SwoopConsoleEntry {
|
|
3199
|
+
level: 'debug' | 'info' | 'warn' | 'error' | 'log';
|
|
3200
|
+
message: string;
|
|
3201
|
+
args?: string[];
|
|
3202
|
+
time: number;
|
|
3203
|
+
}
|
|
3204
|
+
interface SwoopScriptError {
|
|
3205
|
+
stage: 'bootstrap' | 'script' | 'wait';
|
|
3206
|
+
scriptUrl?: string;
|
|
3207
|
+
message: string;
|
|
3208
|
+
stack?: string;
|
|
3209
|
+
}
|
|
3210
|
+
interface SwoopResult {
|
|
3211
|
+
/**
|
|
3212
|
+
* Final URL after redirects.
|
|
3213
|
+
*/
|
|
3214
|
+
url: string;
|
|
3215
|
+
/**
|
|
3216
|
+
* Snapshot HTML (best-effort).
|
|
3217
|
+
*/
|
|
3218
|
+
html: string;
|
|
3219
|
+
/**
|
|
3220
|
+
* Console output captured from the isolated execution environment.
|
|
3221
|
+
*/
|
|
3222
|
+
console: SwoopConsoleEntry[];
|
|
3223
|
+
/**
|
|
3224
|
+
* Script/bootstrap errors captured during execution.
|
|
3225
|
+
*/
|
|
3226
|
+
errors: SwoopScriptError[];
|
|
3227
|
+
/**
|
|
3228
|
+
* Timing metadata (ms).
|
|
3229
|
+
*/
|
|
3230
|
+
timing: {
|
|
3231
|
+
start: number;
|
|
3232
|
+
end: number;
|
|
3233
|
+
duration: number;
|
|
3234
|
+
};
|
|
3235
|
+
}
|
|
3236
|
+
|
|
3237
|
+
/**
|
|
3238
|
+
* Execute client-side JavaScript against a DOM-only environment and snapshot the resulting HTML.
|
|
3239
|
+
*
|
|
3240
|
+
* @remarks
|
|
3241
|
+
* **Experimental feature**.
|
|
3242
|
+
*
|
|
3243
|
+
* @remarks
|
|
3244
|
+
* - Default engine (`vm`) works on regular Node.js.
|
|
3245
|
+
*
|
|
3246
|
+
* This is *not* a real browser engine:
|
|
3247
|
+
* - No layout/paint/CSS correctness
|
|
3248
|
+
* - No true navigation lifecycle
|
|
3249
|
+
* - Best-effort shims for browser APIs
|
|
3250
|
+
*
|
|
3251
|
+
* ⚠️ **Security**: This executes third-party JavaScript. Only use on trusted sources or in an OS sandbox.
|
|
3252
|
+
*/
|
|
3253
|
+
declare function swoop(url: string | URL, init?: SwoopInit): Promise<SwoopResult>;
|
|
3254
|
+
|
|
3255
|
+
export { type AlternateLink, type AnalyticsMetadata, type AppLinks, type AppleTouchIcon, type Article, type AssetsMetadata, type CanonicalMetadata, type ConnectionHint, type ContentExtractionOptions, type ContentQuality, type ContentResult, type CopyrightMetadata, type DiscoveredFeed, type DublinCoreMetadata, type ExtractedContent, type ExtractedLink, type ExtractionErrorType, type ExtractionFailure, type Feed, type FeedAuthor, type FeedDiscoveryMetadata, type FeedEnclosure, type FeedFormat, type FeedItem, type GeoMetadata, type GeoPosition, type HTMLDocument, type HtmlToTextOptions, type IconsMetadata, type JsonLdBlock, type LanguageMetadata, type LinksExtractionOptions, type LinksMetadata, type MSTile, type MaskIcon, type MonetizationMetadata, type NewsMetadata, type OpenGraphArticle, type OpenGraphAudio, type OpenGraphBook, type OpenGraphImage, type OpenGraphMetadata, type OpenGraphProfile, type OpenGraphVideo, type PaginationMetadata, type ParseResult, PluckContentTypeError, PluckEncodingError, PluckError, PluckHttpError, type PluckInit, PluckNetworkError, PluckRedirectError, type PluckResponse, PluckSizeError, PluckTimeoutError, type PreloadResource, type RobotDirectives, type RobotsMetadata, type SEOMetadata, type SchemaOrgMetadata, type SecurityMetadata, type SitemapDiscoveryMetadata, type SocialProfilesMetadata, SwoopEnvironmentError, SwoopError, SwoopExecutionError, type SwoopInit, type SwoopResult, SwoopSecurityError, SwoopTimeoutError, type SwoopWaitStrategy, type TwitterApp, type TwitterAppPlatform, type TwitterCardMetadata, type TwitterPlayer, type VerificationMetadata, type Website, assessContentQuality, calculateReadingTime, countWords, detectFormat, extractAnalytics, extractAssets, extractCanonical, extractContent, extractCopyright, extractDublinCore, extractFeedDiscovery, extractGeo, extractIcons, extractLanguage, extractLinks, extractMonetization, extractNews, extractOpenGraph, extractPagination, extractRobots, extractSEO, extractSchemaOrg, extractSecurity, extractSitemapDiscovery, extractSocialProfiles, extractTwitterCard, extractVerification, gatherArticle, gatherFeed, gatherWebsite, htmlToText, isAtom, isFeed, isJSONFeed, isProbablyReaderable, isRSS, parseFeed, parseHTML, pluck, swoop };
|
package/dist/index.d.ts
CHANGED
|
@@ -3069,4 +3069,187 @@ declare class PluckContentTypeError extends PluckError {
|
|
|
3069
3069
|
*/
|
|
3070
3070
|
declare function pluck(input: string | URL | Request, init?: PluckInit): Promise<PluckResponse>;
|
|
3071
3071
|
|
|
3072
|
-
|
|
3072
|
+
/**
|
|
3073
|
+
* Base error type for `swoop()`.
|
|
3074
|
+
*/
|
|
3075
|
+
declare class SwoopError extends Error {
|
|
3076
|
+
name: string;
|
|
3077
|
+
}
|
|
3078
|
+
/**
|
|
3079
|
+
* Thrown when the current runtime cannot execute `swoop()`.
|
|
3080
|
+
*/
|
|
3081
|
+
declare class SwoopEnvironmentError extends SwoopError {
|
|
3082
|
+
name: string;
|
|
3083
|
+
}
|
|
3084
|
+
/**
|
|
3085
|
+
* Thrown when `swoop()` exceeds its configured timeout.
|
|
3086
|
+
*/
|
|
3087
|
+
declare class SwoopTimeoutError extends SwoopError {
|
|
3088
|
+
name: string;
|
|
3089
|
+
}
|
|
3090
|
+
/**
|
|
3091
|
+
* Thrown when script execution fails in a non-recoverable way.
|
|
3092
|
+
*/
|
|
3093
|
+
declare class SwoopExecutionError extends SwoopError {
|
|
3094
|
+
name: string;
|
|
3095
|
+
}
|
|
3096
|
+
/**
|
|
3097
|
+
* Thrown when `swoop()` is asked to execute potentially unsafe scripts
|
|
3098
|
+
* in a context where the caller should explicitly acknowledge the risk.
|
|
3099
|
+
*/
|
|
3100
|
+
declare class SwoopSecurityError extends SwoopError {
|
|
3101
|
+
name: string;
|
|
3102
|
+
}
|
|
3103
|
+
|
|
3104
|
+
/**
|
|
3105
|
+
* How `swoop()` decides when a client-rendered page is "done enough" to snapshot.
|
|
3106
|
+
*
|
|
3107
|
+
* @remarks
|
|
3108
|
+
* This is DOM-only rendering (no layout/paint). "Done" is best-effort.
|
|
3109
|
+
*/
|
|
3110
|
+
type SwoopWaitStrategy = 'timeout' | 'networkidle';
|
|
3111
|
+
type SwoopEngine = 'vm';
|
|
3112
|
+
interface SwoopInit {
|
|
3113
|
+
/**
|
|
3114
|
+
* Execution engine used for running third-party scripts.
|
|
3115
|
+
*
|
|
3116
|
+
* @remarks
|
|
3117
|
+
* - `vm` (default): practical, supports `fetch` by reusing host globals
|
|
3118
|
+
*
|
|
3119
|
+
* @defaultValue 'vm'
|
|
3120
|
+
*/
|
|
3121
|
+
engine?: SwoopEngine;
|
|
3122
|
+
/**
|
|
3123
|
+
* Pluck options used for the initial HTML request and external script fetching.
|
|
3124
|
+
*/
|
|
3125
|
+
pluck?: PluckInit;
|
|
3126
|
+
/**
|
|
3127
|
+
* Execute inline and external scripts found in the HTML.
|
|
3128
|
+
*
|
|
3129
|
+
* @defaultValue true
|
|
3130
|
+
*/
|
|
3131
|
+
executeScripts?: boolean;
|
|
3132
|
+
/**
|
|
3133
|
+
* Maximum time to wait for the page to "settle" before taking a snapshot.
|
|
3134
|
+
*
|
|
3135
|
+
* @defaultValue 3000
|
|
3136
|
+
*/
|
|
3137
|
+
timeout?: number;
|
|
3138
|
+
/**
|
|
3139
|
+
* Which waiting strategy to use.
|
|
3140
|
+
*
|
|
3141
|
+
* - `timeout`: sleep for `timeout` and snapshot
|
|
3142
|
+
* - `networkidle`: wait until no tracked fetches are pending for `idleTime`
|
|
3143
|
+
*
|
|
3144
|
+
* @defaultValue 'networkidle'
|
|
3145
|
+
*/
|
|
3146
|
+
waitStrategy?: SwoopWaitStrategy;
|
|
3147
|
+
/**
|
|
3148
|
+
* Required quiet period (ms) for `networkidle`.
|
|
3149
|
+
*
|
|
3150
|
+
* @defaultValue 250
|
|
3151
|
+
*/
|
|
3152
|
+
idleTime?: number;
|
|
3153
|
+
/**
|
|
3154
|
+
* Poll interval (ms) for `networkidle`.
|
|
3155
|
+
*
|
|
3156
|
+
* @defaultValue 25
|
|
3157
|
+
*/
|
|
3158
|
+
pollInterval?: number;
|
|
3159
|
+
/**
|
|
3160
|
+
* How many scripts to load/execute at most.
|
|
3161
|
+
*
|
|
3162
|
+
* @defaultValue 64
|
|
3163
|
+
*/
|
|
3164
|
+
maxScripts?: number;
|
|
3165
|
+
/**
|
|
3166
|
+
* If true, forward console output from the isolated realm to the host console.
|
|
3167
|
+
*
|
|
3168
|
+
* @defaultValue false
|
|
3169
|
+
*/
|
|
3170
|
+
forwardConsole?: boolean;
|
|
3171
|
+
/**
|
|
3172
|
+
* If true, installs permissive Proxy-based stubs for common missing browser APIs.
|
|
3173
|
+
*
|
|
3174
|
+
* @remarks
|
|
3175
|
+
* This may hide some failures (by turning hard crashes into no-ops), but improves
|
|
3176
|
+
* compatibility for a best-effort snapshotter.
|
|
3177
|
+
*
|
|
3178
|
+
* @defaultValue true
|
|
3179
|
+
*/
|
|
3180
|
+
permissiveShims?: boolean;
|
|
3181
|
+
/**
|
|
3182
|
+
* Record all sandbox `fetch()` calls into the captured console output.
|
|
3183
|
+
*
|
|
3184
|
+
* @defaultValue false
|
|
3185
|
+
*/
|
|
3186
|
+
debugFetch?: boolean;
|
|
3187
|
+
/**
|
|
3188
|
+
* Enable additional sandbox probes to help iterative shim development.
|
|
3189
|
+
*
|
|
3190
|
+
* @remarks
|
|
3191
|
+
* Collects lightweight runtime stats (DOM ops/mutations, app-root growth, etc.)
|
|
3192
|
+
* and emits them via captured console.
|
|
3193
|
+
*
|
|
3194
|
+
* @defaultValue false
|
|
3195
|
+
*/
|
|
3196
|
+
debugProbes?: boolean;
|
|
3197
|
+
}
|
|
3198
|
+
interface SwoopConsoleEntry {
|
|
3199
|
+
level: 'debug' | 'info' | 'warn' | 'error' | 'log';
|
|
3200
|
+
message: string;
|
|
3201
|
+
args?: string[];
|
|
3202
|
+
time: number;
|
|
3203
|
+
}
|
|
3204
|
+
interface SwoopScriptError {
|
|
3205
|
+
stage: 'bootstrap' | 'script' | 'wait';
|
|
3206
|
+
scriptUrl?: string;
|
|
3207
|
+
message: string;
|
|
3208
|
+
stack?: string;
|
|
3209
|
+
}
|
|
3210
|
+
interface SwoopResult {
|
|
3211
|
+
/**
|
|
3212
|
+
* Final URL after redirects.
|
|
3213
|
+
*/
|
|
3214
|
+
url: string;
|
|
3215
|
+
/**
|
|
3216
|
+
* Snapshot HTML (best-effort).
|
|
3217
|
+
*/
|
|
3218
|
+
html: string;
|
|
3219
|
+
/**
|
|
3220
|
+
* Console output captured from the isolated execution environment.
|
|
3221
|
+
*/
|
|
3222
|
+
console: SwoopConsoleEntry[];
|
|
3223
|
+
/**
|
|
3224
|
+
* Script/bootstrap errors captured during execution.
|
|
3225
|
+
*/
|
|
3226
|
+
errors: SwoopScriptError[];
|
|
3227
|
+
/**
|
|
3228
|
+
* Timing metadata (ms).
|
|
3229
|
+
*/
|
|
3230
|
+
timing: {
|
|
3231
|
+
start: number;
|
|
3232
|
+
end: number;
|
|
3233
|
+
duration: number;
|
|
3234
|
+
};
|
|
3235
|
+
}
|
|
3236
|
+
|
|
3237
|
+
/**
|
|
3238
|
+
* Execute client-side JavaScript against a DOM-only environment and snapshot the resulting HTML.
|
|
3239
|
+
*
|
|
3240
|
+
* @remarks
|
|
3241
|
+
* **Experimental feature**.
|
|
3242
|
+
*
|
|
3243
|
+
* @remarks
|
|
3244
|
+
* - Default engine (`vm`) works on regular Node.js.
|
|
3245
|
+
*
|
|
3246
|
+
* This is *not* a real browser engine:
|
|
3247
|
+
* - No layout/paint/CSS correctness
|
|
3248
|
+
* - No true navigation lifecycle
|
|
3249
|
+
* - Best-effort shims for browser APIs
|
|
3250
|
+
*
|
|
3251
|
+
* ⚠️ **Security**: This executes third-party JavaScript. Only use on trusted sources or in an OS sandbox.
|
|
3252
|
+
*/
|
|
3253
|
+
declare function swoop(url: string | URL, init?: SwoopInit): Promise<SwoopResult>;
|
|
3254
|
+
|
|
3255
|
+
export { type AlternateLink, type AnalyticsMetadata, type AppLinks, type AppleTouchIcon, type Article, type AssetsMetadata, type CanonicalMetadata, type ConnectionHint, type ContentExtractionOptions, type ContentQuality, type ContentResult, type CopyrightMetadata, type DiscoveredFeed, type DublinCoreMetadata, type ExtractedContent, type ExtractedLink, type ExtractionErrorType, type ExtractionFailure, type Feed, type FeedAuthor, type FeedDiscoveryMetadata, type FeedEnclosure, type FeedFormat, type FeedItem, type GeoMetadata, type GeoPosition, type HTMLDocument, type HtmlToTextOptions, type IconsMetadata, type JsonLdBlock, type LanguageMetadata, type LinksExtractionOptions, type LinksMetadata, type MSTile, type MaskIcon, type MonetizationMetadata, type NewsMetadata, type OpenGraphArticle, type OpenGraphAudio, type OpenGraphBook, type OpenGraphImage, type OpenGraphMetadata, type OpenGraphProfile, type OpenGraphVideo, type PaginationMetadata, type ParseResult, PluckContentTypeError, PluckEncodingError, PluckError, PluckHttpError, type PluckInit, PluckNetworkError, PluckRedirectError, type PluckResponse, PluckSizeError, PluckTimeoutError, type PreloadResource, type RobotDirectives, type RobotsMetadata, type SEOMetadata, type SchemaOrgMetadata, type SecurityMetadata, type SitemapDiscoveryMetadata, type SocialProfilesMetadata, SwoopEnvironmentError, SwoopError, SwoopExecutionError, type SwoopInit, type SwoopResult, SwoopSecurityError, SwoopTimeoutError, type SwoopWaitStrategy, type TwitterApp, type TwitterAppPlatform, type TwitterCardMetadata, type TwitterPlayer, type VerificationMetadata, type Website, assessContentQuality, calculateReadingTime, countWords, detectFormat, extractAnalytics, extractAssets, extractCanonical, extractContent, extractCopyright, extractDublinCore, extractFeedDiscovery, extractGeo, extractIcons, extractLanguage, extractLinks, extractMonetization, extractNews, extractOpenGraph, extractPagination, extractRobots, extractSEO, extractSchemaOrg, extractSecurity, extractSitemapDiscovery, extractSocialProfiles, extractTwitterCard, extractVerification, gatherArticle, gatherFeed, gatherWebsite, htmlToText, isAtom, isFeed, isJSONFeed, isProbablyReaderable, isRSS, parseFeed, parseHTML, pluck, swoop };
|