@pinkpixel/sugarstitch 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +59 -0
- package/LICENSE +21 -0
- package/OVERVIEW.md +306 -0
- package/README.md +462 -0
- package/assets/banner_dark.png +0 -0
- package/assets/banner_light.png +0 -0
- package/assets/logo.png +0 -0
- package/assets/screenshot_cli.png +0 -0
- package/assets/screenshot_completed.png +0 -0
- package/assets/screenshot_homepage.png +0 -0
- package/assets/screenshot_scraping.png +0 -0
- package/dist/index.js +216 -0
- package/dist/scraper.js +719 -0
- package/dist/server.js +1272 -0
- package/package.json +26 -0
- package/public/favicon.png +0 -0
- package/scripts/add-shebang.js +11 -0
- package/src/index.ts +217 -0
- package/src/scraper.ts +903 -0
- package/src/server.ts +1319 -0
- package/tsconfig.json +12 -0
- package/website/astro.config.mjs +5 -0
- package/website/package-lock.json +6358 -0
- package/website/package.json +18 -0
- package/website/public/banner_dark.png +0 -0
- package/website/public/banner_light.png +0 -0
- package/website/public/favicon.png +0 -0
- package/website/public/screenshot_cli.png +0 -0
- package/website/public/screenshot_completed.png +0 -0
- package/website/public/screenshot_homepage.png +0 -0
- package/website/public/screenshot_scraping.png +0 -0
- package/website/src/layouts/DocsLayout.astro +142 -0
- package/website/src/pages/docs/install.astro +96 -0
- package/website/src/pages/docs/use-the-app.astro +131 -0
- package/website/src/pages/index.astro +94 -0
- package/website/src/styles/site.css +611 -0
- package/website/tsconfig.json +3 -0
- package/website/wrangler.toml +6 -0
package/package.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@pinkpixel/sugarstitch",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "A CLI tool to scrape fiber arts patterns",
|
|
5
|
+
"main": "dist/index.js",
|
|
6
|
+
"bin": {
|
|
7
|
+
"sugarstitch": "dist/index.js"
|
|
8
|
+
},
|
|
9
|
+
"license": "MIT",
|
|
10
|
+
"scripts": {
|
|
11
|
+
"build": "tsc && node scripts/add-shebang.js",
|
|
12
|
+
"start": "ts-node src/index.ts",
|
|
13
|
+
"scrape": "ts-node src/index.ts",
|
|
14
|
+
"ui": "ts-node src/server.ts"
|
|
15
|
+
},
|
|
16
|
+
"dependencies": {
|
|
17
|
+
"axios": "^1.6.8",
|
|
18
|
+
"cheerio": "^1.0.0-rc.12",
|
|
19
|
+
"commander": "^12.0.0"
|
|
20
|
+
},
|
|
21
|
+
"devDependencies": {
|
|
22
|
+
"@types/node": "^20.11.30",
|
|
23
|
+
"ts-node": "^10.9.2",
|
|
24
|
+
"typescript": "^5.4.3"
|
|
25
|
+
}
|
|
26
|
+
}
|
|
Binary file
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
// Prepend "#!/usr/bin/env node" to the compiled CLI bundle so the `bin`
// entry is directly executable. Safe to run repeatedly: the write is
// skipped when the shebang is already present.
const fs = require('fs');
const path = require('path');

const bundlePath = path.resolve(__dirname, '..', 'dist', 'index.js');
const SHEBANG_LINE = '#!/usr/bin/env node\n';

const bundleSource = fs.readFileSync(bundlePath, 'utf8');

if (!bundleSource.startsWith(SHEBANG_LINE)) {
  fs.writeFileSync(bundlePath, SHEBANG_LINE + bundleSource, 'utf8');
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { Command } from 'commander';
|
|
4
|
+
import * as fs from 'fs/promises';
|
|
5
|
+
import * as path from 'path';
|
|
6
|
+
import {
|
|
7
|
+
DEFAULT_PROFILES_FILE,
|
|
8
|
+
normalizeUrl,
|
|
9
|
+
dedupeStrings,
|
|
10
|
+
scrapeUrls,
|
|
11
|
+
previewPattern,
|
|
12
|
+
getSelectorPresets,
|
|
13
|
+
isSelectorPresetId,
|
|
14
|
+
sanitizeSelectorOverrides,
|
|
15
|
+
type SelectorPresetId
|
|
16
|
+
} from './scraper';
|
|
17
|
+
|
|
18
|
+
const program = new Command();
|
|
19
|
+
|
|
20
|
+
program
|
|
21
|
+
.name('sugarstitch')
|
|
22
|
+
.description('✨ Bulk scrape fiber arts patterns, images, AND PDFs into sweet little local files ✨')
|
|
23
|
+
.version('1.0.0')
|
|
24
|
+
.option('-u, --url <url>', 'A single URL of the pattern page to scrape')
|
|
25
|
+
.option('-f, --file <file>', 'A text file containing a list of URLs (one per line)')
|
|
26
|
+
.option('-o, --output <path>', 'Output JSON file name', 'pattern-data.json')
|
|
27
|
+
.option('--output-dir <path>', 'Directory where JSON, images, and PDFs should be saved')
|
|
28
|
+
.option('-p, --preset <preset>', `Selector preset: ${getSelectorPresets().map(preset => preset.id).join(', ')}`, 'generic')
|
|
29
|
+
.option('--crawl', 'Discover links from the starting URL(s) before scraping them')
|
|
30
|
+
.option('--crawl-depth <number>', 'How many link levels deep to follow in crawl mode', '2')
|
|
31
|
+
.option('--crawl-pattern <pattern>', 'Only follow discovered links whose URL or link text matches this text or regex')
|
|
32
|
+
.option('--crawl-language <language>', 'Prefer discovered URLs for one language such as english, french, or portuguese')
|
|
33
|
+
.option('--crawl-paginate', 'Expand listing pages like /page/2/, /page/3/, and scrape them too')
|
|
34
|
+
.option('--crawl-max-pages <number>', 'Maximum listing pages to add in pagination mode', '20')
|
|
35
|
+
.option('--crawl-any-domain', 'Allow crawl mode to follow links to other domains')
|
|
36
|
+
.option('--crawl-max-urls <number>', 'Maximum number of discovered page URLs to scrape', '100')
|
|
37
|
+
.option('--profile <id>', 'Use a saved site profile from the profiles config file')
|
|
38
|
+
.option('--profiles-file <path>', `Path to the site profiles config file (default: ${DEFAULT_PROFILES_FILE})`, DEFAULT_PROFILES_FILE)
|
|
39
|
+
.option('--preview', 'Preview what would be extracted without downloading files or writing JSON')
|
|
40
|
+
.option('--title-selector <selector>', 'Override the title selector for this run')
|
|
41
|
+
.option('--description-selector <selector>', 'Override the description selector for this run')
|
|
42
|
+
.option('--materials-selector <selector>', 'Override the materials selector for this run')
|
|
43
|
+
.option('--instructions-selector <selector>', 'Override the instructions selector for this run')
|
|
44
|
+
.option('--image-selector <selector>', 'Override the image selector for this run')
|
|
45
|
+
.parse(process.argv);
|
|
46
|
+
|
|
47
|
+
const options = program.opts();
|
|
48
|
+
|
|
49
|
+
const ANSI_RESET = '\x1b[0m';
|
|
50
|
+
const ANSI_PINK = '\x1b[38;5;205m';
|
|
51
|
+
const ANSI_MINT = '\x1b[38;5;121m';
|
|
52
|
+
const ANSI_SKY = '\x1b[38;5;117m';
|
|
53
|
+
const ANSI_GOLD = '\x1b[38;5;223m';
|
|
54
|
+
|
|
55
|
+
function colorize(line: string, color: string): string {
|
|
56
|
+
return `${color}${line}${ANSI_RESET}`;
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
/**
 * Print the colorful SugarStitch ASCII-art banner to stdout.
 *
 * Falls back to a plain one-line banner when stdout is not a TTY or when the
 * NO_COLOR environment variable is set, so piped/redirected output stays free
 * of ANSI escape sequences.
 */
function printBanner(): void {
  if (!process.stdout.isTTY || process.env.NO_COLOR) {
    console.log('\nSugarStitch\n');
    return;
  }

  // Each banner row is tinted with the pastel palette cycling pink → mint →
  // sky → gold.
  const bannerLines = [
    colorize('███████╗██╗ ██╗ ██████╗ █████╗ ██████╗ ', ANSI_PINK),
    colorize('██╔════╝██║ ██║██╔════╝ ██╔══██╗██╔══██╗', ANSI_MINT),
    colorize('███████╗██║ ██║██║ ███╗███████║██████╔╝', ANSI_SKY),
    colorize('╚════██║██║ ██║██║ ██║██╔══██║██╔══██╗', ANSI_GOLD),
    colorize('███████║╚██████╔╝╚██████╔╝██║ ██║██║ ██║', ANSI_PINK),
    colorize('╚══════╝ ╚═════╝ ╚═════╝ ╚═╝ ╚═╝╚═╝ ╚═╝', ANSI_MINT),
    colorize(' ███████╗████████╗██╗████████╗ ██████╗██╗ ██╗', ANSI_SKY),
    colorize(' ██╔════╝╚══██╔══╝██║╚══██╔══╝██╔════╝██║ ██║', ANSI_GOLD),
    colorize(' ███████╗ ██║ ██║ ██║ ██║ ███████║', ANSI_PINK),
    colorize(' ╚════██║ ██║ ██║ ██║ ██║ ██╔══██║', ANSI_MINT),
    colorize(' ███████║ ██║ ██║ ██║ ╚██████╗██║ ██║', ANSI_SKY),
    colorize(' ╚══════╝ ╚═╝ ╚═╝ ╚═╝ ╚═════╝╚═╝ ╚═╝', ANSI_GOLD)
  ];

  console.log(`\n${bannerLines.join('\n')}`);
  console.log(colorize('Sweet little fiber arts scraper', ANSI_GOLD));
  console.log('');
}
|
|
84
|
+
|
|
85
|
+
function validateInputOptions(): void {
|
|
86
|
+
if (options.url && options.file) {
|
|
87
|
+
console.error('\n❌ Please use either --url or --file, not both at the same time.');
|
|
88
|
+
process.exitCode = 1;
|
|
89
|
+
program.help();
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
if (!options.url && !options.file) {
|
|
93
|
+
console.error('\n❌ You need to provide either a single URL (-u) or a text file (-f) to scrape.');
|
|
94
|
+
process.exitCode = 1;
|
|
95
|
+
program.help();
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
if (!isSelectorPresetId(options.preset)) {
|
|
99
|
+
console.error(`\n❌ Unknown preset "${options.preset}". Use one of: ${getSelectorPresets().map(preset => preset.id).join(', ')}`);
|
|
100
|
+
process.exitCode = 1;
|
|
101
|
+
program.help();
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
function resolveOutputPaths(outputName: string, outputDirectory?: string): { outputDirectory: string; outputPath: string } {
|
|
106
|
+
const resolvedOutputDirectory = outputDirectory
|
|
107
|
+
? path.resolve(process.cwd(), outputDirectory)
|
|
108
|
+
: process.cwd();
|
|
109
|
+
const outputPath = path.isAbsolute(outputName)
|
|
110
|
+
? outputName
|
|
111
|
+
: path.resolve(resolvedOutputDirectory, outputName);
|
|
112
|
+
|
|
113
|
+
return {
|
|
114
|
+
outputDirectory: resolvedOutputDirectory,
|
|
115
|
+
outputPath
|
|
116
|
+
};
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
async function getUrlsFromOptions(): Promise<string[]> {
|
|
120
|
+
if (options.url) {
|
|
121
|
+
const normalizedUrl = normalizeUrl(options.url);
|
|
122
|
+
|
|
123
|
+
if (!normalizedUrl) {
|
|
124
|
+
throw new Error(`That doesn't look like a valid URL: ${options.url}`);
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
return [normalizedUrl];
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
const filePath = path.resolve(process.cwd(), options.file);
|
|
131
|
+
const fileContent = await fs.readFile(filePath, 'utf-8');
|
|
132
|
+
const rawLines = fileContent.split(/\r?\n/).map(line => line.trim()).filter(line => line.length > 0);
|
|
133
|
+
const validUrls = rawLines
|
|
134
|
+
.map(normalizeUrl)
|
|
135
|
+
.filter((line): line is string => Boolean(line));
|
|
136
|
+
const invalidCount = rawLines.length - validUrls.length;
|
|
137
|
+
const urls = dedupeStrings(validUrls);
|
|
138
|
+
|
|
139
|
+
console.log(`\n📚 Fuck yeah, loaded ${urls.length} URLs from ${options.file}. Let's get to work...`);
|
|
140
|
+
|
|
141
|
+
if (invalidCount > 0) {
|
|
142
|
+
console.log(`⚠️ Skipped ${invalidCount} line(s) because they were not valid http(s) URLs.`);
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
return urls;
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
async function run() {
|
|
149
|
+
printBanner();
|
|
150
|
+
validateInputOptions();
|
|
151
|
+
|
|
152
|
+
try {
|
|
153
|
+
const urls = await getUrlsFromOptions();
|
|
154
|
+
const profilesPath = path.resolve(process.cwd(), options.profilesFile);
|
|
155
|
+
const { outputDirectory, outputPath } = resolveOutputPaths(options.output, options.outputDir);
|
|
156
|
+
const selectorOverrides = sanitizeSelectorOverrides({
|
|
157
|
+
titleSelector: options.titleSelector,
|
|
158
|
+
descriptionSelector: options.descriptionSelector,
|
|
159
|
+
materialsSelector: options.materialsSelector,
|
|
160
|
+
instructionsSelector: options.instructionsSelector,
|
|
161
|
+
imageSelector: options.imageSelector
|
|
162
|
+
});
|
|
163
|
+
|
|
164
|
+
if (options.preview) {
|
|
165
|
+
const preview = await previewPattern({
|
|
166
|
+
url: urls[0],
|
|
167
|
+
preset: options.preset as SelectorPresetId,
|
|
168
|
+
selectorOverrides,
|
|
169
|
+
profileId: options.profile,
|
|
170
|
+
profilesPath
|
|
171
|
+
}, message => console.log(message));
|
|
172
|
+
|
|
173
|
+
console.log('\nPreview Summary');
|
|
174
|
+
console.log(`Title: ${preview.title}`);
|
|
175
|
+
console.log(`Description: ${preview.description}`);
|
|
176
|
+
console.log(`Preset: ${preview.presetLabel}`);
|
|
177
|
+
if (preview.profileLabel) {
|
|
178
|
+
console.log(`Profile: ${preview.profileLabel}`);
|
|
179
|
+
}
|
|
180
|
+
if (preview.materials.length > 0) {
|
|
181
|
+
console.log(`Materials (${preview.materials.length}): ${preview.materials.join(' | ')}`);
|
|
182
|
+
}
|
|
183
|
+
if (preview.instructions.length > 0) {
|
|
184
|
+
console.log(`Instructions (${preview.instructions.length}): ${preview.instructions.slice(0, 5).join(' | ')}`);
|
|
185
|
+
}
|
|
186
|
+
console.log(`Images found: ${preview.imageUrls.length}`);
|
|
187
|
+
console.log(`PDFs found: ${preview.pdfUrls.length}`);
|
|
188
|
+
return;
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
await scrapeUrls({
|
|
192
|
+
urls,
|
|
193
|
+
outputPath,
|
|
194
|
+
preset: options.preset as SelectorPresetId,
|
|
195
|
+
profileId: options.profile,
|
|
196
|
+
profilesPath,
|
|
197
|
+
selectorOverrides,
|
|
198
|
+
crawl: {
|
|
199
|
+
enabled: Boolean(options.crawl),
|
|
200
|
+
maxDepth: Number.parseInt(options.crawlDepth, 10),
|
|
201
|
+
sameDomainOnly: !options.crawlAnyDomain,
|
|
202
|
+
linkPattern: options.crawlPattern,
|
|
203
|
+
maxDiscoveredUrls: Number.parseInt(options.crawlMaxUrls, 10),
|
|
204
|
+
language: options.crawlLanguage,
|
|
205
|
+
paginate: Boolean(options.crawlPaginate),
|
|
206
|
+
maxPaginationPages: Number.parseInt(options.crawlMaxPages, 10)
|
|
207
|
+
},
|
|
208
|
+
workingDirectory: outputDirectory,
|
|
209
|
+
logger: message => console.log(message)
|
|
210
|
+
});
|
|
211
|
+
} catch (error: any) {
|
|
212
|
+
console.error(`\n❌ ${error.message}`);
|
|
213
|
+
process.exitCode = 1;
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
run();
|