@pinkpixel/sugarstitch 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/CHANGELOG.md +59 -0
  2. package/LICENSE +21 -0
  3. package/OVERVIEW.md +306 -0
  4. package/README.md +462 -0
  5. package/assets/banner_dark.png +0 -0
  6. package/assets/banner_light.png +0 -0
  7. package/assets/logo.png +0 -0
  8. package/assets/screenshot_cli.png +0 -0
  9. package/assets/screenshot_completed.png +0 -0
  10. package/assets/screenshot_homepage.png +0 -0
  11. package/assets/screenshot_scraping.png +0 -0
  12. package/dist/index.js +216 -0
  13. package/dist/scraper.js +719 -0
  14. package/dist/server.js +1272 -0
  15. package/package.json +26 -0
  16. package/public/favicon.png +0 -0
  17. package/scripts/add-shebang.js +11 -0
  18. package/src/index.ts +217 -0
  19. package/src/scraper.ts +903 -0
  20. package/src/server.ts +1319 -0
  21. package/tsconfig.json +12 -0
  22. package/website/astro.config.mjs +5 -0
  23. package/website/package-lock.json +6358 -0
  24. package/website/package.json +18 -0
  25. package/website/public/banner_dark.png +0 -0
  26. package/website/public/banner_light.png +0 -0
  27. package/website/public/favicon.png +0 -0
  28. package/website/public/screenshot_cli.png +0 -0
  29. package/website/public/screenshot_completed.png +0 -0
  30. package/website/public/screenshot_homepage.png +0 -0
  31. package/website/public/screenshot_scraping.png +0 -0
  32. package/website/src/layouts/DocsLayout.astro +142 -0
  33. package/website/src/pages/docs/install.astro +96 -0
  34. package/website/src/pages/docs/use-the-app.astro +131 -0
  35. package/website/src/pages/index.astro +94 -0
  36. package/website/src/styles/site.css +611 -0
  37. package/website/tsconfig.json +3 -0
  38. package/website/wrangler.toml +6 -0
package/README.md ADDED
@@ -0,0 +1,462 @@
1
+ <p align="center">
2
+ <img src="assets/logo.png" alt="SugarStitch logo" width="300" height="300" />
3
+ </p>
4
+
5
+ # SugarStitch
6
+
7
+ SugarStitch is a TypeScript scraper for fiber arts pattern websites with both a CLI and a local browser UI. It can scrape individual pattern pages, batch lists of URLs, or discover pattern pages from an index page and then scrape those discovered links for titles, text, images, and PDFs.
8
+
9
+ ## Screenshots
10
+
11
+ ### Local UI
12
+
13
+ ![SugarStitch homepage UI](website/public/screenshot_homepage.png)
14
+
15
+ ![SugarStitch scraping progress state](website/public/screenshot_scraping.png)
16
+
17
+ ![SugarStitch completed run summary](website/public/screenshot_completed.png)
18
+
19
+ ### CLI
20
+
21
+ ![SugarStitch CLI](website/public/screenshot_cli.png)
22
+
23
+ ## What It Does
24
+
25
+ - Scrapes a single pattern URL or a list of URLs from a text file
26
+ - Includes a simple local browser UI for people who prefer forms over command-line flags
27
+ - Supports discovery crawl mode so one listing page can expand into many pattern pages
28
+ - Supports crawl language filtering so discovered pages can stay in one language
29
+ - Supports crawl pagination so listing pages like `/page/2/` and `/page/3/` can be added automatically
30
+ - Includes built-in selector presets for `generic`, `wordpress`, and `woocommerce`
31
+ - Supports reusable saved site profiles from a JSON config file
32
+ - Lets you override title, description, materials, instructions, and image selectors per run
33
+ - Includes a preview mode to test selectors before downloading files or writing JSON
34
+ - Lets you choose an output directory for the JSON file plus downloaded assets
35
+ - Shows an in-page loading state while preview or scrape requests are running
36
+ - Downloads linked PDFs and page images when found
37
+ - Skips already-known `sourceUrl` entries before re-scraping them
38
+
39
+ ## Best Supported Site Types
40
+
41
+ SugarStitch works best on sites where the pattern content is already present in the HTML response and does not require a JavaScript app to render first.
42
+
43
+ Typical use cases include:
44
+
45
+ - sewing pattern blogs
46
+ - crochet pattern pages
47
+ - knitting pattern archives
48
+ - quilting, embroidery, and other fiber arts tutorial or pattern sites
49
+
50
+ Usually a good fit:
51
+
52
+ - WordPress pattern blogs and article pages
53
+ - Blogger and Blogspot pattern pages
54
+ - WooCommerce product-style pattern pages
55
+ - older handcrafted sites with normal HTML articles
56
+ - free-pattern archive pages that link to regular child pages
57
+
58
+ More mixed or site-specific:
59
+
60
+ - Wix
61
+ - Squarespace
62
+ - Webflow
63
+ - custom JavaScript-heavy sites
64
+
65
+ Usually not a good fit with the current scraper approach:
66
+
67
+ - React single-page apps
68
+ - hash-routed sites like `#/free-patterns`
69
+ - pages where the content only appears after client-side JavaScript runs
70
+
71
+ Why:
72
+
73
+ SugarStitch currently fetches page HTML and parses it directly. It does not run a full browser-rendered scraping flow yet, so JavaScript-only pages may return just the site shell instead of the real pattern content.
74
+
75
+ If a site only partly works, try:
76
+
77
+ - switching selector presets
78
+ - using `Test Selectors` first
79
+ - creating a saved site profile
80
+ - adding one or two advanced selector overrides
81
+
82
+ ## Install
83
+
84
+ ### Global Install
85
+
86
+ ```bash
87
+ npm install -g @pinkpixel/sugarstitch
88
+ ```
89
+
90
+ Then run it as:
91
+
92
+ ```bash
93
+ sugarstitch --url "https://example.com/pattern"
94
+ ```
95
+
96
+ ### Local Development Install
97
+
98
+ ```bash
99
+ git clone https://github.com/pinkpixel-dev/sugarstitch.git
100
+ cd sugarstitch
101
+ npm install
102
+ ```
103
+
104
+ ## Available Scripts
105
+
106
+ ```bash
107
+ npm run build
108
+ ```
109
+
110
+ Compiles TypeScript into `dist/`.
111
+
112
+ ```bash
113
+ npm run scrape -- --url "https://example.com/pattern"
114
+ ```
115
+
116
+ Runs the CLI with `ts-node`.
117
+
118
+ ```bash
119
+ npm run ui
120
+ ```
121
+
122
+ Starts the local UI at `http://localhost:4177`.
123
+
124
+ ## Quick Start
125
+
126
+ ### Scrape One Pattern Page
127
+
128
+ ```bash
129
+ npm run scrape -- --url "https://example.com/pattern" --preset wordpress
130
+ ```
131
+
132
+ ### Scrape Many URLs From a File
133
+
134
+ Create `urls.txt`:
135
+
136
+ ```txt
137
+ https://example.com/pattern-1
138
+ https://example.com/pattern-2
139
+ https://example.com/pattern-3
140
+ ```
141
+
142
+ Then run:
143
+
144
+ ```bash
145
+ npm run scrape -- --file urls.txt
146
+ ```
147
+
148
+ ### Save Output Somewhere Else
149
+
150
+ ```bash
151
+ npm run scrape -- --url "https://example.com/pattern" --output-dir ./exports --output patterns.json
152
+ ```
153
+
154
+ That saves:
155
+
156
+ - `patterns.json`
157
+ - `images/`
158
+ - `pdfs/`
159
+ - `texts/`
160
+
161
+ inside `./exports`.
162
+
163
+ ## Discovery Crawl Mode
164
+
165
+ Discovery crawl mode is for index pages such as “Free Patterns” pages. Instead of entering every pattern URL yourself, you can start from one page and let SugarStitch follow links a couple levels deep before scraping the discovered pages.
166
+
167
+ This is useful for:
168
+
169
+ - free-pattern listing pages
170
+ - archive pages
171
+ - blog category pages
172
+ - collections where the real pattern content lives on child pages
173
+
174
+ ### Example
175
+
176
+ ```bash
177
+ npm run scrape -- \
178
+ --url "https://www.tildasworld.com/free-patterns/" \
179
+ --preset wordpress \
180
+ --crawl \
181
+ --crawl-depth 2 \
182
+ --crawl-pattern "free_pattern|pattern|quilt|pillow" \
183
+ --crawl-language english \
184
+ --crawl-paginate
185
+ ```
186
+
187
+ That tells SugarStitch to:
188
+
189
+ 1. Start from the given listing page
190
+ 2. Follow matching links up to 2 levels deep
191
+ 3. Stay on the same domain by default
192
+ 4. Scrape the discovered pages themselves
193
+
194
+ So if a child page is a blog-style pattern page with no PDF but useful article content, SugarStitch will still try to scrape that page normally.
195
+
196
+ ### Crawl Options
197
+
198
+ - `--crawl`: turns discovery mode on
199
+ - `--crawl-depth <number>`: how many link levels deep to follow
200
+ - `--crawl-pattern <pattern>`: only follow links whose URL or link text matches this text or regex
201
+ - `--crawl-language <language>`: prefer discovered URLs for one language such as `english`, `french`, or `portuguese`
202
+ - `--crawl-paginate`: expand paginated listing pages like `/page/2/`, `/page/3/`, and so on
203
+ - `--crawl-max-pages <number>`: cap how many listing pages are added in pagination mode
204
+ - `--crawl-any-domain`: allow discovery to follow links outside the starting domain
205
+ - `--crawl-max-urls <number>`: cap how many discovered pages get scraped
206
+
207
+ ### Why Crawl Language Filtering Helps
208
+
209
+ Some sites expose multiple language sections from the same listing page. For example, an English archive may also link to French or Portuguese archives. With `--crawl-language english`, SugarStitch can keep the discovered crawl focused on English pages instead of mixing languages into one run.
210
+
211
+ ### Why Crawl Pagination Helps
212
+
213
+ Some listing pages only expose the first batch of pattern cards until you click a `Load More` control. If the site also exposes those later batches as regular paginated URLs, SugarStitch can add those deeper listing pages automatically before discovery continues.
214
+
215
+ ## Local Web UI
216
+
217
+ Run:
218
+
219
+ ```bash
220
+ npm run ui
221
+ ```
222
+
223
+ Then open:
224
+
225
+ ```text
226
+ http://localhost:4177
227
+ ```
228
+
229
+ ![SugarStitch homepage showing the scrape form and saved profiles](website/public/screenshot_homepage.png)
230
+
231
+ The UI includes:
232
+
233
+ - single URL mode
234
+ - multi-URL paste mode
235
+ - saved site profile dropdown
236
+ - selector preset dropdown
237
+ - advanced selector override fields
238
+ - discovery crawl controls
239
+ - crawl language and crawl pagination controls
240
+ - output JSON filename field
241
+ - output directory field
242
+ - `Test Selectors` preview button
243
+ - `Start Scraping` button
244
+ - light and dark mode toggle
245
+ - spinner/progress overlay while requests are running
246
+
247
+ ![SugarStitch progress overlay while a scrape is running](website/public/screenshot_scraping.png)
248
+
249
+ ![SugarStitch completed run summary with log output](website/public/screenshot_completed.png)
250
+
251
+ ### Output Directory In the UI
252
+
253
+ Use the `Output Directory` field to choose where the JSON file and downloaded folders should be saved.
254
+
255
+ If left blank, SugarStitch saves into the project folder you launched it from.
256
+
257
+ Note:
258
+ This is currently a path field, not a native folder picker. In a normal browser-based local UI, the page cannot reliably hand a true local filesystem path back to the server the way a desktop app can.
259
+
260
+ ## Selector Presets
261
+
262
+ Selector presets are defined in [`src/scraper.ts`](src/scraper.ts).
263
+
264
+ Built-in presets:
265
+
266
+ - `generic`: a broad fallback for custom and article-style pages
267
+ - `wordpress`: tuned for common WordPress post wrappers like `.entry-content`
268
+ - `woocommerce`: tuned for WooCommerce product pages and galleries
269
+
270
+ These are starting points, not guarantees.
271
+
272
+ ## Advanced Selector Overrides
273
+
274
+ If a preset is close but not quite right, you can override only the fields you need for a single run.
275
+
276
+ Available override flags:
277
+
278
+ - `--title-selector`
279
+ - `--description-selector`
280
+ - `--materials-selector`
281
+ - `--instructions-selector`
282
+ - `--image-selector`
283
+
284
+ Example:
285
+
286
+ ```bash
287
+ npm run scrape -- \
288
+ --url "https://example.com/pattern" \
289
+ --preset wordpress \
290
+ --materials-selector ".entry-content ul li"
291
+ ```
292
+
293
+ Overrides take priority over the selected preset for that field only.
294
+
295
+ ## Saved Site Profiles
296
+
297
+ SugarStitch can load reusable profiles from [`sugarstitch.profiles.json`](sugarstitch.profiles.json).
298
+
299
+ Each profile can define:
300
+
301
+ - `id`
302
+ - `label`
303
+ - `description`
304
+ - `preset`
305
+ - `selectorOverrides`
306
+
307
+ Example:
308
+
309
+ ```json
310
+ {
311
+ "profiles": [
312
+ {
313
+ "id": "tildas-world",
314
+ "label": "Tilda's World",
315
+ "preset": "wordpress",
316
+ "selectorOverrides": {
317
+ "materialsSelector": ".entry-content ul li",
318
+ "instructionsSelector": ".entry-content ol li"
319
+ }
320
+ }
321
+ ]
322
+ }
323
+ ```
324
+
325
+ Use one with:
326
+
327
+ ```bash
328
+ npm run scrape -- --url "https://example.com/pattern" --profile tildas-world
329
+ ```
330
+
331
+ Or point to another file:
332
+
333
+ ```bash
334
+ npm run scrape -- --url "https://example.com/pattern" --profile tildas-world --profiles-file ./my-profiles.json
335
+ ```
336
+
337
+ ## Preview Mode
338
+
339
+ Preview mode lets you test extraction before writing JSON or downloading files.
340
+
341
+ It:
342
+
343
+ - fetches the page
344
+ - applies the selected preset, saved profile, and any advanced overrides
345
+ - shows the matched title, description, materials, instructions, images, and PDFs
346
+ - does not write files
347
+
348
+ CLI example:
349
+
350
+ ```bash
351
+ npm run scrape -- --url "https://example.com/pattern" --profile tildas-world --preview
352
+ ```
353
+
354
+ UI flow:
355
+
356
+ 1. Choose `Single URL`
357
+ 2. Enter a pattern page URL
358
+ 3. Pick a preset or saved profile
359
+ 4. Add overrides if needed
360
+ 5. Click `Test Selectors`
361
+
362
+ ## CLI Options
363
+
364
+ ```text
365
+ -u, --url <url> A single URL of the pattern page to scrape
366
+ -f, --file <file> A text file containing a list of URLs
367
+ -o, --output <path> Output JSON file name
368
+ --output-dir <path> Directory where JSON, images, and PDFs should be saved
369
+ -p, --preset <preset> Selector preset
370
+ --crawl Discover links from the starting URL(s) before scraping them
371
+ --crawl-depth <number> How many link levels deep to follow in crawl mode
372
+ --crawl-pattern <pattern> Only follow discovered links whose URL or link text matches this text or regex
373
+ --crawl-language <language> Prefer discovered URLs for one language such as english, french, or portuguese
374
+ --crawl-paginate Expand listing pages like /page/2/, /page/3/, and scrape them too
375
+ --crawl-max-pages <number> Maximum listing pages to add in pagination mode
376
+ --crawl-any-domain Allow crawl mode to follow links to other domains
377
+ --crawl-max-urls <number> Maximum number of discovered page URLs to scrape
378
+ --profile <id> Use a saved site profile
379
+ --profiles-file <path> Path to the profiles config file
380
+ --preview Preview extraction without saving files
381
+ --title-selector <selector>
382
+ --description-selector <selector>
383
+ --materials-selector <selector>
384
+ --instructions-selector <selector>
385
+ --image-selector <selector>
386
+ ```
387
+
388
+ ## Output Structure
389
+
390
+ SugarStitch writes one object per successfully scraped page:
391
+
392
+ ```json
393
+ {
394
+ "title": "Pattern Title",
395
+ "description": "Short description from the page",
396
+ "materials": ["Cotton fabric", "Stuffing", "Thread"],
397
+ "instructions": ["Cut the pieces", "Sew the body", "Stuff and close"],
398
+ "sourceUrl": "https://example.com/pattern",
399
+ "localImages": ["images/pattern_title/image_1.jpg"],
400
+ "localPdfs": ["pdfs/pattern_title/pattern.pdf"],
401
+ "localTextFile": "texts/pattern_title/pattern.txt"
402
+ }
403
+ ```
404
+
405
+ Each scraped page also gets a plain-text artifact at `texts/<pattern_title>/pattern.txt`.
406
+
407
+ That text file includes:
408
+
409
+ - title
410
+ - source URL
411
+ - selected preset and optional profile
412
+ - extracted description
413
+ - extracted materials list
414
+ - extracted instructions list
415
+ - a fuller page text block gathered from the article content
416
+
417
+ ## Notes
418
+
419
+ - The CLI prints a small SugarStitch ASCII banner when run in a normal terminal.
420
+ - The local UI now includes a light/dark mode toggle, with light mode as the default.
421
+
422
+ ![SugarStitch CLI banner and progress output](website/public/screenshot_cli.png)
423
+
424
+ ## Troubleshooting
425
+
426
+ ### It scraped PDFs and titles, but not much else
427
+
428
+ That still counts as a successful scrape. It usually means the page-level selectors for description, materials, instructions, or images do not match the site structure yet.
429
+
430
+ Try one of these:
431
+
432
+ - run `Test Selectors` in the UI first
433
+ - switch presets
434
+ - use a saved profile for that site
435
+ - add one or two advanced overrides
436
+
437
+ ### Discovery crawl found too much or too little
438
+
439
+ Adjust:
440
+
441
+ - `crawl depth`
442
+ - `crawl pattern`
443
+ - `crawl language`
444
+ - crawl pagination settings
445
+ - same-domain restriction
446
+ - max discovered URLs
447
+
448
+ ### The output file already exists but the scraper refuses to run
449
+
450
+ If the JSON file contains invalid JSON, SugarStitch will stop instead of silently overwriting it. Fix or remove the broken file first.
451
+
452
+ ## Development Notes
453
+
454
+ - CLI entrypoint: [`src/index.ts`](src/index.ts)
455
+ - UI entrypoint: [`src/server.ts`](src/server.ts)
456
+ - Shared scraper logic: [`src/scraper.ts`](src/scraper.ts)
457
+ - Starter profiles config: [`sugarstitch.profiles.json`](sugarstitch.profiles.json)
458
+ - Technical overview: [`OVERVIEW.md`](OVERVIEW.md)
459
+
460
+ ## License
461
+
462
+ This project is licensed under the MIT License. See [`LICENSE`](LICENSE).
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
package/dist/index.js ADDED
@@ -0,0 +1,216 @@
1
+ #!/usr/bin/env node
2
+ "use strict";
+ // ------------------------------------------------------------------
+ // Compiled CommonJS output emitted by tsc from src/index.ts.
+ // The three helpers below are the standard TypeScript interop shims
+ // that back `import * as ns from '...'`; do not edit them by hand.
+ // ------------------------------------------------------------------
3
+ // Re-binds property `k` of module `m` onto namespace object `o` (as `k2`),
+ // preserving live getters when the runtime supports property descriptors.
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
4
+ if (k2 === undefined) k2 = k;
5
+ var desc = Object.getOwnPropertyDescriptor(m, k);
6
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
7
+ desc = { enumerable: true, get: function() { return m[k]; } };
8
+ }
9
+ Object.defineProperty(o, k2, desc);
10
+ }) : (function(o, m, k, k2) {
11
+ if (k2 === undefined) k2 = k;
12
+ o[k2] = m[k];
13
+ }));
14
+ // Attaches the source module as the `default` member of a namespace object.
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
15
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
16
+ }) : function(o, v) {
17
+ o["default"] = v;
18
+ });
19
+ // Builds the namespace object for `import * as ns` from a CommonJS module:
+ // copies every own key except `default`, then sets `default` to the module itself.
+ var __importStar = (this && this.__importStar) || (function () {
20
+ var ownKeys = function(o) {
21
+ ownKeys = Object.getOwnPropertyNames || function (o) {
22
+ var ar = [];
23
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
24
+ return ar;
25
+ };
26
+ return ownKeys(o);
27
+ };
28
+ return function (mod) {
29
+ if (mod && mod.__esModule) return mod;
30
+ var result = {};
31
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
32
+ __setModuleDefault(result, mod);
33
+ return result;
34
+ };
35
+ })();
36
+ // Mark this compiled module as an ES module for downstream interop checks.
+ Object.defineProperty(exports, "__esModule", { value: true });
37
+ const commander_1 = require("commander");
38
+ const fs = __importStar(require("fs/promises"));
39
+ const path = __importStar(require("path"));
40
+ const scraper_1 = require("./scraper");
41
+ // CLI surface: a single Commander program whose flags mirror the README's
+ // "CLI Options" section. Compiled from src/index.ts — edit the source, not dist.
+ const program = new commander_1.Command();
42
+ program
43
+ .name('sugarstitch')
44
+ .description('✨ Bulk scrape fiber arts patterns, images, AND PDFs into sweet little local files ✨')
45
+ .version('1.0.0')
+ // Input selection: exactly one of --url / --file is expected; this is
+ // enforced later by validateInputOptions, not by Commander itself.
46
+ .option('-u, --url <url>', 'A single URL of the pattern page to scrape')
47
+ .option('-f, --file <file>', 'A text file containing a list of URLs (one per line)')
48
+ .option('-o, --output <path>', 'Output JSON file name', 'pattern-data.json')
49
+ .option('--output-dir <path>', 'Directory where JSON, images, and PDFs should be saved')
50
+ .option('-p, --preset <preset>', `Selector preset: ${(0, scraper_1.getSelectorPresets)().map(preset => preset.id).join(', ')}`, 'generic')
+ // Discovery-crawl flags; numeric values arrive as strings and are parsed in run().
51
+ .option('--crawl', 'Discover links from the starting URL(s) before scraping them')
52
+ .option('--crawl-depth <number>', 'How many link levels deep to follow in crawl mode', '2')
53
+ .option('--crawl-pattern <pattern>', 'Only follow discovered links whose URL or link text matches this text or regex')
54
+ .option('--crawl-language <language>', 'Prefer discovered URLs for one language such as english, french, or portuguese')
55
+ .option('--crawl-paginate', 'Expand listing pages like /page/2/, /page/3/, and scrape them too')
56
+ .option('--crawl-max-pages <number>', 'Maximum listing pages to add in pagination mode', '20')
57
+ .option('--crawl-any-domain', 'Allow crawl mode to follow links to other domains')
58
+ .option('--crawl-max-urls <number>', 'Maximum number of discovered page URLs to scrape', '100')
+ // Saved-profile and preview controls.
59
+ .option('--profile <id>', 'Use a saved site profile from the profiles config file')
60
+ .option('--profiles-file <path>', `Path to the site profiles config file (default: ${scraper_1.DEFAULT_PROFILES_FILE})`, scraper_1.DEFAULT_PROFILES_FILE)
61
+ .option('--preview', 'Preview what would be extracted without downloading files or writing JSON')
+ // Per-run selector overrides; each beats the selected preset for that field only.
62
+ .option('--title-selector <selector>', 'Override the title selector for this run')
63
+ .option('--description-selector <selector>', 'Override the description selector for this run')
64
+ .option('--materials-selector <selector>', 'Override the materials selector for this run')
65
+ .option('--instructions-selector <selector>', 'Override the instructions selector for this run')
66
+ .option('--image-selector <selector>', 'Override the image selector for this run')
67
+ .parse(process.argv);
68
+ const options = program.opts();
69
// ANSI escape sequences for the pastel banner palette.
const ANSI_RESET = '\x1b[0m';
const ANSI_PINK = '\x1b[38;5;205m';
const ANSI_MINT = '\x1b[38;5;121m';
const ANSI_SKY = '\x1b[38;5;117m';
const ANSI_GOLD = '\x1b[38;5;223m';
/**
 * Wrap one line of text in the given ANSI color sequence, appending a
 * reset so the color never bleeds into the following output.
 *
 * @param {string} line - Text to tint.
 * @param {string} color - ANSI escape sequence to apply.
 * @returns {string} The colorized line.
 */
function colorize(line, color) {
    return color + line + ANSI_RESET;
}
77
+ // Print the SugarStitch startup banner.
+ // Falls back to a plain one-word banner when stdout is not a TTY
+ // (piped/redirected output) or when the NO_COLOR convention is set.
+ function printBanner() {
78
+ if (!process.stdout.isTTY || process.env.NO_COLOR) {
79
+ console.log('\nSugarStitch\n');
80
+ return;
81
+ }
82
+ // Block-letter "SUGAR / STITCH" art, cycling through the pastel palette.
+ const bannerLines = [
83
+ colorize('███████╗██╗ ██╗ ██████╗ █████╗ ██████╗ ', ANSI_PINK),
84
+ colorize('██╔════╝██║ ██║██╔════╝ ██╔══██╗██╔══██╗', ANSI_MINT),
85
+ colorize('███████╗██║ ██║██║ ███╗███████║██████╔╝', ANSI_SKY),
86
+ colorize('╚════██║██║ ██║██║ ██║██╔══██║██╔══██╗', ANSI_GOLD),
87
+ colorize('███████║╚██████╔╝╚██████╔╝██║ ██║██║ ██║', ANSI_PINK),
88
+ colorize('╚══════╝ ╚═════╝ ╚═════╝ ╚═╝ ╚═╝╚═╝ ╚═╝', ANSI_MINT),
89
+ colorize(' ███████╗████████╗██╗████████╗ ██████╗██╗ ██╗', ANSI_SKY),
90
+ colorize(' ██╔════╝╚══██╔══╝██║╚══██╔══╝██╔════╝██║ ██║', ANSI_GOLD),
91
+ colorize(' ███████╗ ██║ ██║ ██║ ██║ ███████║', ANSI_PINK),
92
+ colorize(' ╚════██║ ██║ ██║ ██║ ██║ ██╔══██║', ANSI_MINT),
93
+ colorize(' ███████║ ██║ ██║ ██║ ╚██████╗██║ ██║', ANSI_SKY),
94
+ colorize(' ╚══════╝ ╚═╝ ╚═╝ ╚═╝ ╚═════╝╚═╝ ╚═╝', ANSI_GOLD)
95
+ ];
96
+ console.log(`\n${bannerLines.join('\n')}`);
97
+ console.log(colorize('Sweet little fiber arts scraper', ANSI_GOLD));
98
+ console.log('');
99
+ }
100
+ // Validate the mutually-exclusive input flags and the selector preset id.
+ // On any violation: print the problem to stderr, set a failing exit code,
+ // and display usage help.
+ function validateInputOptions() {
101
+ if (options.url && options.file) {
102
+ console.error('\n❌ Please use either --url or --file, not both at the same time.');
103
+ process.exitCode = 1;
104
+ // NOTE(review): program.help() prints help and exits the process, so
+ // nothing after it runs; confirm against Commander docs that the exit
+ // honors the process.exitCode set just above.
+ program.help();
105
+ }
106
+ if (!options.url && !options.file) {
107
+ console.error('\n❌ You need to provide either a single URL (-u) or a text file (-f) to scrape.');
108
+ process.exitCode = 1;
109
+ program.help();
110
+ }
111
+ // Preset ids are validated against the list exported by the scraper module.
+ if (!(0, scraper_1.isSelectorPresetId)(options.preset)) {
112
+ console.error(`\n❌ Unknown preset "${options.preset}". Use one of: ${(0, scraper_1.getSelectorPresets)().map(preset => preset.id).join(', ')}`);
113
+ process.exitCode = 1;
114
+ program.help();
115
+ }
116
+ }
117
/**
 * Work out where the JSON output file and downloaded assets should live.
 *
 * @param {string} outputName - Output JSON file name; may be absolute.
 * @param {string|undefined} outputDirectory - Optional --output-dir value,
 *   resolved against the current working directory.
 * @returns {{outputDirectory: string, outputPath: string}} The resolved
 *   asset directory and the full path of the JSON file.
 */
function resolveOutputPaths(outputName, outputDirectory) {
    // Without an explicit directory, everything lands in the launch directory.
    let resolvedOutputDirectory = process.cwd();
    if (outputDirectory) {
        resolvedOutputDirectory = path.resolve(process.cwd(), outputDirectory);
    }
    // An absolute output name bypasses the output directory entirely;
    // a relative one is placed inside it.
    let outputPath = outputName;
    if (!path.isAbsolute(outputName)) {
        outputPath = path.resolve(resolvedOutputDirectory, outputName);
    }
    return {
        outputDirectory: resolvedOutputDirectory,
        outputPath
    };
}
129
/**
 * Build the list of URLs to scrape from the CLI options.
 *
 * Single-URL mode (--url): normalizes the one URL and returns it alone,
 * throwing if it is not a valid http(s) URL.
 * File mode (--file): reads the file, trims and drops blank lines,
 * normalizes each line, reports how many lines were skipped as invalid,
 * and returns the deduplicated valid URLs.
 *
 * @returns {Promise<string[]>} Normalized, deduplicated URLs to scrape.
 * @throws {Error} If --url is present but not a valid URL, or the file
 *   cannot be read.
 */
async function getUrlsFromOptions() {
    if (options.url) {
        const normalizedUrl = (0, scraper_1.normalizeUrl)(options.url);
        if (!normalizedUrl) {
            throw new Error(`That doesn't look like a valid URL: ${options.url}`);
        }
        return [normalizedUrl];
    }
    const filePath = path.resolve(process.cwd(), options.file);
    const fileContent = await fs.readFile(filePath, 'utf-8');
    const rawLines = fileContent.split(/\r?\n/).map(line => line.trim()).filter(line => line.length > 0);
    const validUrls = rawLines
        .map(scraper_1.normalizeUrl)
        .filter((line) => Boolean(line));
    const invalidCount = rawLines.length - validUrls.length;
    const urls = (0, scraper_1.dedupeStrings)(validUrls);
    // Fix: removed profanity from this user-facing message — this is the
    // published CLI output of the package.
    console.log(`\n📚 Loaded ${urls.length} URLs from ${options.file}. Let's get to work...`);
    if (invalidCount > 0) {
        console.log(`⚠️ Skipped ${invalidCount} line(s) because they were not valid http(s) URLs.`);
    }
    return urls;
}
151
/**
 * CLI entry point: prints the banner, validates the parsed options, then
 * either previews selector extraction for the first URL (--preview) or
 * runs a full scrape via the shared scraper module.
 *
 * Any failure is reported on stderr and reflected in the process exit code.
 */
async function run() {
    printBanner();
    validateInputOptions();
    try {
        const urls = await getUrlsFromOptions();
        const profilesPath = path.resolve(process.cwd(), options.profilesFile);
        const { outputDirectory, outputPath } = resolveOutputPaths(options.output, options.outputDir);
        // Only explicitly-provided selector flags survive sanitization.
        const selectorOverrides = (0, scraper_1.sanitizeSelectorOverrides)({
            titleSelector: options.titleSelector,
            descriptionSelector: options.descriptionSelector,
            materialsSelector: options.materialsSelector,
            instructionsSelector: options.instructionsSelector,
            imageSelector: options.imageSelector
        });
        if (options.preview) {
            // Preview mode inspects only the first URL and writes no files.
            const preview = await (0, scraper_1.previewPattern)({
                url: urls[0],
                preset: options.preset,
                selectorOverrides,
                profileId: options.profile,
                profilesPath
            }, message => console.log(message));
            console.log('\nPreview Summary');
            console.log(`Title: ${preview.title}`);
            console.log(`Description: ${preview.description}`);
            console.log(`Preset: ${preview.presetLabel}`);
            if (preview.profileLabel) {
                console.log(`Profile: ${preview.profileLabel}`);
            }
            if (preview.materials.length > 0) {
                console.log(`Materials (${preview.materials.length}): ${preview.materials.join(' | ')}`);
            }
            if (preview.instructions.length > 0) {
                console.log(`Instructions (${preview.instructions.length}): ${preview.instructions.slice(0, 5).join(' | ')}`);
            }
            console.log(`Images found: ${preview.imageUrls.length}`);
            console.log(`PDFs found: ${preview.pdfUrls.length}`);
            return;
        }
        await (0, scraper_1.scrapeUrls)({
            urls,
            outputPath,
            preset: options.preset,
            profileId: options.profile,
            profilesPath,
            selectorOverrides,
            // Crawl flags arrive from Commander as strings; parse them here.
            crawl: {
                enabled: Boolean(options.crawl),
                maxDepth: Number.parseInt(options.crawlDepth, 10),
                sameDomainOnly: !options.crawlAnyDomain,
                linkPattern: options.crawlPattern,
                maxDiscoveredUrls: Number.parseInt(options.crawlMaxUrls, 10),
                language: options.crawlLanguage,
                paginate: Boolean(options.crawlPaginate),
                maxPaginationPages: Number.parseInt(options.crawlMaxPages, 10)
            },
            workingDirectory: outputDirectory,
            logger: message => console.log(message)
        });
    }
    catch (error) {
        // Fix: a thrown value is not guaranteed to be an Error instance, so
        // dereferencing error.message directly could print "undefined".
        const message = error instanceof Error ? error.message : String(error);
        console.error(`\n❌ ${message}`);
        process.exitCode = 1;
    }
}
run();