codescoop 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,455 @@
1
+ /**
2
+ * HTML Parser Module
3
+ * Parses HTML and extracts target elements by selector or line range
4
+ */
5
+
6
+ const cheerio = require('cheerio');
7
+
8
/**
 * Load an HTML string into cheerio.
 *
 * The document is loaded with `recognizeSelfClosing` enabled and with
 * `lowerCaseTags` / `lowerCaseAttributeNames` disabled.
 *
 * @param {string} htmlContent - Raw HTML string
 * @returns {CheerioAPI} Cheerio instance
 */
function parseHTML(htmlContent) {
  const parserOptions = {
    recognizeSelfClosing: true,
    lowerCaseTags: false,
    lowerCaseAttributeNames: false
  };

  return cheerio.load(htmlContent, parserOptions);
}
20
+
21
/**
 * Extract target element information
 *
 * The target is picked by CSS selector when `options.selector` is set,
 * otherwise by a 1-based inclusive line range of the original source.
 *
 * @param {CheerioAPI} $ - Cheerio instance
 * @param {string} htmlContent - Original HTML content (for line number calculation)
 * @param {Object} [options]
 * @param {string} [options.selector] - CSS selector
 * @param {string} [options.lineRange] - Line range (e.g., "45-80")
 * @param {number} [options.matchIndex=0] - Which match to use if multiple (0-based).
 *   An index past the last match falls back to the last match (with a warning).
 * @returns {Object} Target element information: `html`, `classes`, `ids`,
 *   `dataAttributes`, `shadowParts`, `tagName`, `startLine`, `endLine`,
 *   `summary`, `selector`, `matchCount` and `warning`. `startLine`/`endLine`
 *   are undefined when the selected markup cannot be located in `htmlContent`.
 * @throws {Error} When neither selector nor lineRange is given, when the
 *   selector is exactly body/html/head, when nothing matches, when the match
 *   index cannot be resolved, or when the line range is malformed or out of
 *   bounds.
 */
function extractTargetElement($, htmlContent, options = {}) {
  const { selector, lineRange, matchIndex = 0 } = options;

  let target;
  if (selector) {
    target = resolveTargetBySelector($, htmlContent, selector, matchIndex);
  } else if (lineRange) {
    target = resolveTargetByLineRange(htmlContent, lineRange);
  } else {
    throw new Error('Either selector or lineRange must be provided');
  }

  const { targetElement, targetHtml, startLine, endLine, matchCount, warning } = target;

  // Extract metadata from target element
  const classes = extractClasses(targetElement, $);
  const ids = extractIds(targetElement, $);
  const dataAttributes = extractDataAttributes(targetElement, $);
  const shadowParts = extractShadowParts(targetElement, $);
  const tagName = targetElement.prop('tagName')?.toLowerCase() || 'div';

  // Warn if no classes or IDs found
  if (classes.length === 0 && ids.length === 0) {
    console.warn('\x1b[33m%s\x1b[0m', '⚠️ Target element has no classes or IDs. CSS/JS detection may be limited.');
  }

  // Generate a summary description
  const summary = generateSummary(tagName, classes, ids);

  return {
    html: targetHtml,
    classes,
    ids,
    dataAttributes,
    shadowParts,
    tagName,
    startLine,
    endLine,
    summary,
    selector: selector || `lines ${lineRange}`,
    matchCount,
    warning
  };
}

/**
 * Count the newline characters in a string.
 */
function countNewlines(text) {
  return (text.match(/\n/g) || []).length;
}

/**
 * Return the offset of the occurrence of `fragment` in `text` that follows
 * `skip` earlier non-overlapping occurrences, or -1 when there is none.
 */
function findOccurrenceOffset(text, fragment, skip) {
  const step = Math.max(fragment.length, 1);
  let offset = text.indexOf(fragment);
  for (let skipped = 0; skipped < skip && offset !== -1; skipped += 1) {
    offset = text.indexOf(fragment, offset + step);
  }
  return offset;
}

/**
 * Resolve the target through a CSS selector and work out its line span.
 */
function resolveTargetBySelector($, htmlContent, selector, matchIndex) {
  // Validate selector syntax
  const normalizedSelector = normalizeSelector(selector);
  const selectorLower = normalizedSelector.toLowerCase().trim();

  // Check for problematic selectors
  const problematicTags = ['body', 'html', 'head'];
  if (problematicTags.includes(selectorLower)) {
    const suggestions = {
      'body': 'Targeting <body> analyzes the entire page. Try targeting a specific section like "main", "header", or a class like ".content".',
      'html': 'Targeting <html> includes everything. Try targeting "body > *" for top-level elements or specific components.',
      'head': 'The <head> tag contains metadata, not visible components. Try targeting visible elements like "header" or "nav".'
    };
    throw new Error(`Cannot analyze "${selectorLower}" as a component.\n\n${suggestions[selectorLower]}`);
  }

  // Warn about very broad selectors
  const broadSelectors = ['div', 'span', 'p', 'a', 'li', 'ul', 'section'];
  if (broadSelectors.includes(selectorLower)) {
    console.warn('\x1b[33m%s\x1b[0m', `⚠️ Selector "${selectorLower}" is very broad and will match many elements.`);
    console.warn('\x1b[33m%s\x1b[0m', ` Consider using a more specific selector like ".${selectorLower}-class" or "#${selectorLower}-id".`);
  }

  // Note about semantic tags (these are fine but informational)
  const semanticTags = ['header', 'footer', 'nav', 'main', 'aside', 'article'];
  if (semanticTags.includes(selectorLower)) {
    console.log('\x1b[36m%s\x1b[0m', `ℹ️ Targeting semantic element <${selectorLower}>. This is typically unique per page.`);
  }

  // Find all matching elements
  const allMatches = $(normalizedSelector);
  const matchCount = allMatches.length;

  if (matchCount === 0) {
    // Try to provide helpful error message
    const suggestions = getSelectorSuggestions(selector);
    let errorMsg = `No element found matching selector: ${selector}`;
    if (suggestions) {
      errorMsg += `\n\nDid you mean:\n${suggestions}`;
    }
    throw new Error(errorMsg);
  }

  // Get the specified match (default: first); too-large indexes fall back to
  // the last match.
  const actualIndex = Math.min(matchIndex, matchCount - 1);
  const targetElement = allMatches.eq(actualIndex);

  if (targetElement.length === 0) {
    throw new Error(`Element at index ${matchIndex} not found. Only ${matchCount} matches available.`);
  }

  // A negative index is resolved from the end by `.eq()`; mirror that here so
  // the warning and the line lookup refer to the element actually used.
  const resolvedIndex = actualIndex < 0 ? matchCount + actualIndex : actualIndex;

  const notices = [];
  if (matchCount > 1) {
    const chosen = resolvedIndex === 0
      ? 'the first one (index 0)'
      : `the one at index ${resolvedIndex}`;
    notices.push(`⚠️ Found ${matchCount} elements matching "${selector}". Using ${chosen}.`);
    notices.push(' Use --match-index N to select a different one, or use a more specific selector.');
  }
  if (matchIndex > matchCount - 1) {
    notices.push(`⚠️ Match index ${matchIndex} is out of range (${matchCount} match(es) found). Using index ${resolvedIndex} instead.`);
  }
  const warning = notices.length > 0 ? notices.join('\n') : null;
  if (warning) {
    console.warn('\x1b[33m%s\x1b[0m', warning); // Yellow warning
  }

  const targetHtml = $.html(targetElement);

  // Calculate line numbers. Earlier matches that serialize to the same markup
  // would otherwise be found first, so skip one textual occurrence for each.
  // Best effort only: the lookup relies on the serialized markup appearing
  // verbatim in the source.
  let identicalBefore = 0;
  for (let i = 0; i < resolvedIndex; i += 1) {
    if ($.html(allMatches.eq(i)) === targetHtml) {
      identicalBefore += 1;
    }
  }

  let startLine;
  let endLine;
  const position = findOccurrenceOffset(htmlContent, targetHtml, identicalBefore);
  if (position !== -1) {
    startLine = countNewlines(htmlContent.substring(0, position)) + 1;
    endLine = startLine + countNewlines(targetHtml);
  }

  return { targetElement, targetHtml, startLine, endLine, matchCount, warning };
}

/**
 * Resolve the target from a "start-end" line range of the original source.
 */
function resolveTargetByLineRange(htmlContent, lineRange) {
  // Validate line range format
  if (!/^\d+-\d+$/.test(lineRange)) {
    throw new Error(`Invalid line range format: "${lineRange}". Use format like "45-80".`);
  }

  const [start, end] = lineRange.split('-').map(Number);

  // Validate line numbers
  const lines = htmlContent.split('\n');
  const totalLines = lines.length;
  if (start < 1 || end < 1) {
    throw new Error(`Line numbers must be positive. Got: ${lineRange}`);
  }
  if (start > end) {
    throw new Error(`Start line (${start}) cannot be greater than end line (${end}).`);
  }
  if (end > totalLines) {
    throw new Error(`End line (${end}) exceeds file length (${totalLines} lines).`);
  }

  const targetHtml = lines.slice(start - 1, end).join('\n');

  // Parse the extracted HTML to get element info.
  // NOTE(review): only the first top-level element of the range is used for
  // metadata; later siblings in the range are not inspected.
  const $target = cheerio.load(targetHtml);
  let targetElement = $target('body').children().first();

  if (targetElement.length === 0) {
    // If no proper element, treat the whole selection as the target
    targetElement = $target('body');
  }

  return {
    targetElement,
    targetHtml,
    startLine: start,
    endLine: end,
    matchCount: 0,
    warning: null
  };
}
178
+
179
// Tag names that mark a whitespace-separated selector as a real descendant
// selector ("form input", "table tr td") rather than a list of class names.
const KNOWN_HTML_TAGS = new Set([
  'html', 'head', 'body',
  'div', 'span', 'header', 'footer', 'nav', 'section', 'article', 'aside', 'main',
  'p', 'a', 'ul', 'ol', 'li', 'dl', 'dt', 'dd',
  'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
  'form', 'input', 'button', 'select', 'option', 'textarea', 'label', 'fieldset', 'legend',
  'table', 'thead', 'tbody', 'tfoot', 'tr', 'td', 'th', 'caption',
  'img', 'picture', 'figure', 'figcaption', 'video', 'audio', 'canvas', 'svg', 'iframe',
  'blockquote', 'pre', 'code', 'strong', 'em'
]);

/**
 * Normalize selector - handle common mistakes
 *
 * Trims the selector and, when it consists only of two or more bare words
 * separated by whitespace and none of them is a known HTML tag name, rewrites
 * it into a compound class selector (e.g. "btn primary large" ->
 * ".btn.primary.large"), printing a warning about the conversion.
 *
 * @param {string} selector - User-provided selector
 * @returns {string} Normalized selector
 */
function normalizeSelector(selector) {
  const normalized = selector.trim();

  // The pattern only admits word characters and whitespace, so selectors that
  // already contain ".", "#", "[" or ">" never reach the auto-fix.
  if (!/^[a-zA-Z0-9_-]+(\s+[a-zA-Z0-9_-]+)+$/.test(normalized)) {
    return normalized;
  }

  const parts = normalized.split(/\s+/);

  // Only auto-fix if all parts look like class names (no HTML tags)
  if (parts.some(part => KNOWN_HTML_TAGS.has(part.toLowerCase()))) {
    return normalized;
  }

  const classSelector = `.${parts.join('.')}`;
  console.warn('\x1b[33m%s\x1b[0m', `⚠️ Selector "${selector}" looks like class names. Auto-converting to "${classSelector}"`, `\n For exact match, use: "${classSelector}"`);
  return classSelector;
}
207
+
208
/**
 * Get selector suggestions when element not found
 *
 * Surrounding whitespace is ignored. A single bare word yields class / ID /
 * tag-name alternatives; two or more bare words yield compound-class
 * alternatives. Anything else yields no suggestions.
 *
 * @param {string} selector - Selector as typed by the user
 * @returns {string|null} Newline-separated suggestion lines, or null when
 *   there is nothing to suggest
 */
function getSelectorSuggestions(selector) {
  const identifier = /^[a-zA-Z][a-zA-Z0-9_-]*$/;
  const trimmed = selector.trim();
  const parts = trimmed.split(/\s+/).filter(part => part);
  const suggestions = [];

  if (parts.length === 1 && identifier.test(trimmed)) {
    // If missing dot for class
    suggestions.push(` - .${trimmed} (as a class)`);
    suggestions.push(` - #${trimmed} (as an ID)`);
    suggestions.push(` - ${trimmed} (as a tag name)`);
  } else if (parts.length > 1 && parts.every(part => identifier.test(part))) {
    // If has spaces (might be trying to list classes)
    suggestions.push(` - .${parts.join('.')} (element with ALL these classes)`);
    suggestions.push(` - .${parts[0]} (just the first class)`);
  }

  return suggestions.length > 0 ? suggestions.join('\n') : null;
}
234
+
235
/**
 * Collect the distinct class names used by an element and its descendants.
 *
 * @param {Cheerio} element - Selection whose own `class` attribute is read first
 * @param {CheerioAPI} $ - Used to wrap each descendant node
 * @returns {string[]} Unique class names in first-seen order
 */
function extractClasses(element, $) {
  const seen = new Set();

  // Split a raw class attribute on whitespace and keep the non-empty tokens.
  const collect = (classAttr) => {
    if (!classAttr) {
      return;
    }
    for (const token of classAttr.split(/\s+/)) {
      const name = token.trim();
      if (name) {
        seen.add(name);
      }
    }
  };

  // The element itself first, then every descendant.
  collect(element.attr('class'));
  element.find('*').each((_, node) => {
    collect($(node).attr('class'));
  });

  return [...seen];
}
261
+
262
/**
 * Collect the distinct `id` values of an element and its descendants.
 *
 * @param {Cheerio} element - Selection whose own `id` is read first
 * @param {CheerioAPI} $ - Used to wrap each descendant node
 * @returns {string[]} Unique IDs in first-seen order
 */
function extractIds(element, $) {
  const found = new Set();

  // Missing or empty ids are skipped.
  const remember = (id) => {
    if (id) {
      found.add(id);
    }
  };

  remember(element.attr('id'));
  element.find('*').each((_, node) => {
    remember($(node).attr('id'));
  });

  return [...found];
}
284
+
285
/**
 * Collect the distinct `data-*` attribute names found on an element and its
 * descendants. Only the names are collected, not their values.
 *
 * @param {Cheerio} element - Selection to inspect
 * @param {CheerioAPI} $ - Not used by this function
 * @returns {string[]} Unique attribute names in first-seen order
 */
function extractDataAttributes(element, $) {
  const names = new Set();

  // Reads the raw node's attribute map; nodes without one contribute nothing.
  const scan = (node) => {
    for (const name of Object.keys(node.attribs || {})) {
      if (name.startsWith('data-')) {
        names.add(name);
      }
    }
  };

  const root = element[0];
  if (root) {
    scan(root);
  }

  element.find('*').each((_, node) => {
    scan(node);
  });

  return [...names];
}
312
+
313
/**
 * Collect the distinct tokens of `part` attributes (Shadow DOM matching) on
 * an element and its descendants.
 *
 * @param {Cheerio} element - Selection to inspect
 * @param {CheerioAPI} $ - Used to wrap each node before reading `part`
 * @returns {string[]} Unique part names in first-seen order
 */
function extractShadowParts(element, $) {
  const names = new Set();

  // A `part` attribute may list several whitespace-separated names.
  const scan = (node) => {
    const raw = $(node).attr('part');
    if (!raw) {
      return;
    }
    for (const token of raw.split(/\s+/)) {
      const name = token.trim();
      if (name) {
        names.add(name);
      }
    }
  };

  const root = element[0];
  if (root) {
    scan(root);
  }

  element.find('*').each((_, node) => {
    scan(node);
  });

  return [...names];
}
340
+
341
/**
 * Build a short opening-tag style description of the target.
 *
 * Shows the first ID (if any) and up to three classes, appending "..." when
 * more classes exist.
 *
 * @param {string} tagName - Tag name to display
 * @param {string[]} classes - Class names collected for the target
 * @param {string[]} ids - IDs collected for the target
 * @returns {string} e.g. `<div id="app" class="a b c...">`
 */
function generateSummary(tagName, classes, ids) {
  const pieces = [`<${tagName}`];

  if (ids.length > 0) {
    pieces.push(` id="${ids[0]}"`);
  }

  if (classes.length > 0) {
    const shown = classes.slice(0, 3).join(' ');
    const ellipsis = classes.length > 3 ? '...' : '';
    pieces.push(` class="${shown}${ellipsis}"`);
  }

  pieces.push('>');
  return pieces.join('');
}
360
+
361
/**
 * Split an element's `class` attribute into its non-empty class names.
 */
function listClassNames($el) {
  return $el.attr('class')?.split(/\s+/).filter(c => c.trim()) || [];
}

/**
 * Render an abbreviated opening tag: optional id plus at most two classes,
 * followed by "..." when more classes exist.
 */
function describeOpeningTag(tagName, id, classes) {
  let text = `<${tagName}`;
  if (id) {
    text += ` id="${id}"`;
  }
  if (classes.length > 0) {
    text += ` class="${classes.slice(0, 2).join(' ')}${classes.length > 2 ? '...' : ''}"`;
  }
  return `${text}>`;
}

/**
 * Get the HTML structure for interactive mode
 *
 * Lists every direct child of <body> (skipping script/style and nodes without
 * a tag name), each followed by those of its own direct children that carry
 * an ID or at least one class. Entries are numbered consecutively from 1.
 *
 * @param {CheerioAPI} $ - Cheerio instance
 * @returns {Array} Array of structure items. Top-level items have
 *   `{ index, tagName, selector, display, hasChildren }`; child items have
 *   `{ index, tagName, selector, display, isChild: true }`. `selector` is
 *   `#id` when an ID exists, otherwise `.firstClass`, otherwise (top level
 *   only) the tag name.
 */
function getHTMLStructure($) {
  const structure = [];
  let index = 1;

  const isSkipped = (tagName) => !tagName || tagName === 'script' || tagName === 'style';

  $('body').children().each((_, element) => {
    const $el = $(element);
    const tagName = element.tagName?.toLowerCase();

    if (isSkipped(tagName)) {
      return;
    }

    const id = $el.attr('id');
    const classes = listClassNames($el);

    let selector;
    if (id) {
      selector = `#${id}`;
    } else if (classes.length > 0) {
      selector = `.${classes[0]}`;
    } else {
      selector = tagName;
    }

    structure.push({
      index: index++,
      tagName,
      selector,
      display: describeOpeningTag(tagName, id, classes),
      hasChildren: $el.children().length > 0
    });

    // Add immediate children that have IDs or classes
    $el.children().each((_, child) => {
      const $child = $(child);
      const childTag = child.tagName?.toLowerCase();

      if (isSkipped(childTag)) {
        return;
      }

      const childId = $child.attr('id');
      const childClasses = listClassNames($child);

      if (!childId && childClasses.length === 0) {
        return;
      }

      structure.push({
        index: index++,
        tagName: childTag,
        selector: childId ? `#${childId}` : `.${childClasses[0]}`,
        display: ` └── ${describeOpeningTag(childTag, childId, childClasses)}`,
        isChild: true
      });
    });
  });

  return structure;
}
450
+
451
+ module.exports = {
452
+ parseHTML,
453
+ extractTargetElement,
454
+ getHTMLStructure
455
+ };