npm - njsparser - Versions diffs - 0.1.0 → 0.2.0 - Mend

njsparser 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35)

package/README.md +390 -40
package/api.js +76 -50
package/bun.lock +2 -48
package/mod.js +148 -0
package/package.json +11 -16
package/parser/flight_data.js +189 -306
package/parser/manifests.js +37 -37
package/parser/next_data.js +29 -26
package/parser/types.js +408 -296
package/parser/urls.js +86 -56
package/tests/api.test.js +96 -0
package/tests/integration.test.js +68 -0
package/tests/parser/flight_data.test.js +105 -0
package/tests/parser/manifests.test.js +50 -0
package/tests/parser/next_data.test.js +53 -0
package/tests/parser/types.test.js +243 -0
package/tests/parser/urls.test.js +84 -0
package/tests/property.test.js +299 -0
package/tests/setup.js +21 -0
package/tests/utils.test.js +32 -0
package/tools.js +263 -185
package/utils.js +29 -24
package/_.js +0 -10
package/_.json +0 -12837
package/api.test.js +0 -41
package/index.js +0 -8
package/package-lock.json +0 -291
package/parser/flight_data.test.js +0 -59
package/parser/manifests.test.js +0 -36
package/parser/next_data.test.js +0 -15
package/parser/types.test.js +0 -261
package/parser/urls.test.js +0 -26
package/test/src/index.js +0 -16
package/tools.test.js +0 -153
package/utils.test.js +0 -38

package/parser/flight_data.js CHANGED

@@ -1,333 +1,216 @@
-import { make_tree } from '../utils.js';
-import { resolve_type } from './types.js';
+/**
+ * Flight data extraction and parsing
+ */
-const _raw_f_data = []; // Type annotation placeholder
-const _re_f_init = /\(self\.__next_f\s?=\s?self\.__next_f\s?\|\|\s?\[\]\)\.push\((\[.+?\])\)/;
-const _re_f_payload = /self\.__next_f\.push\((\[.+)\)$/;
+import { makeTree } from '../utils.js';
+import { resolveType } from './types.js';
+// Regex patterns for matching flight data scripts
+const RE_F_INIT = /\(self\.__next_f\s?=\s?self\.__next_f\s?\|\|\s?\[\]\)\.push\((\[.+?\])\)/;
+const RE_F_PAYLOAD = /self\.__next_f\.push\((\[.+)\)$/;
+// Segment types
+const Segment = {
+  is_bootstrap: 0,
+  is_not_bootstrap: 1,
+  is_form_state: 2,
+  is_binary: 3
+};
 /**
- * Tells if a given page contains any flight data.
- * @param {any} value
- * @returns {boolean}
+ * Check if HTML contains flight data
+ * @param {string} html - HTML string
+ * @param {DOMParser} DOMParser - DOMParser instance
+ * @returns {boolean} True if flight data exists
  */
-export const has_flight_data = (value) => {
-    const $ = make_tree(value);
-    const scripts = $('script');
-    let found = false;
-    scripts.each((_, el) => {
-        const text = $(el).text(); // Use text() instead of html() for scripts content?
-        // cheerio .text() gets text content. .html() gets innerHTML.
-        // Usually safer to use .html() or .text() depending on encoding.
-        // Python lxml xpath text() gets text.
-        if (_re_f_init.test(text)) {
-            found = true;
-            return false; // break
-        }
-    });
-    return found;
-};
+export function hasFlightData(html, DOMParser) {
+  const doc = makeTree(html, DOMParser);
+  const scripts = Array.from(doc.querySelectorAll('script')).map(s => s.textContent || '');
+  return scripts.some(script => RE_F_INIT.test(script));
+}
 /**
- * Will return the raw flight data.
- * @param {any} value
- * @returns {Array | null}
+ * Extract raw flight data from HTML
+ * @param {string} html - HTML string
+ * @param {DOMParser} DOMParser - DOMParser instance
+ * @returns {Array|null} Raw flight data array or null
  */
-export const get_raw_flight_data = (value) => {
-    const $ = make_tree(value);
-    const result = [];
-    let found_init = false;
+export function getRawFlightData(html, DOMParser) {
+  const doc = makeTree(html, DOMParser);
+  const scripts = Array.from(doc.querySelectorAll('script')).map(s => s.textContent || '');
+  const result = [];
+  let foundInit = false;
+  for (const script of scripts) {
+    const trimmed = script.trim();
-    $('script').each((_, el) => {
-        const script = $(el).text()?.trim();
-        if (!script) return;
-        let init_match;
-        if (!found_init && (init_match = script.match(_re_f_init))) {
-            found_init = true;
-            try {
-                result.push(JSON.parse(init_match[1])); // Match group 1 is `[...]`
-            } catch (e) {
-                console.warn("Failed to parse init flight data", e);
-            }
-        }
-        // Note: The python regex for payload might match the same script as init if it has both?
-        // Actually the init regex is for the push call structure used initially.
-        // The payload regex matches `self.__next_f.push([...])` at the end of string.
-        let payload_match;
-        if ((payload_match = script.match(_re_f_payload))) {
-             try {
-                result.push(JSON.parse(payload_match[1]));
-            } catch (e) {
-                console.warn("Failed to parse payload flight data", e);
-            }
-        }
-    });
+    // Check for initialization script
+    if (!foundInit) {
+      const initMatch = trimmed.match(RE_F_INIT);
+      if (initMatch) {
+        foundInit = true;
+        result.push(JSON.parse(initMatch[1]));
+      }
+    }
-    return result.length > 0 ? result : null;
-};
-export const Segment = {
-    is_bootstrap: 0,
-    is_not_bootstrap: 1,
-    is_form_state: 2,
-    is_binary: 3
-};
+    // Check for payload script
+    const payloadMatch = trimmed.match(RE_F_PAYLOAD);
+    if (payloadMatch) {
+      result.push(JSON.parse(payloadMatch[1]));
+    }
+  }
+  return result.length > 0 ? result : null;
+}
 /**
- * Decodes the raw flight data.
- * @param {Array} raw_flight_data
- * @returns {string[]}
+ * Decode raw flight data segments
+ * @param {Array} rawFlightData - Raw flight data array
+ * @returns {Array<string>} Decoded flight data chunks
  */
-export const decode_raw_flight_data = (raw_flight_data) => {
-    let initial_server_data_buffer = null;
-    let initial_form_state_data = null;
+export function decodeRawFlightData(rawFlightData) {
+  let initialServerDataBuffer;
+  let initialFormStateData;
+  for (const seg of rawFlightData) {
+    const segmentType = seg[0];
-    for (const seg of raw_flight_data) {
-        if (seg[0] === Segment.is_bootstrap) {
-            initial_server_data_buffer = [];
-        } else if (seg[0] === Segment.is_not_bootstrap) {
-            if (initial_server_data_buffer === null) {
-                throw new Error('UnboundLocalError: initial_server_data_buffer was not yet initialized');
-            }
-            initial_server_data_buffer.push(seg[1]);
-        } else if (seg[0] === Segment.is_form_state) {
-            initial_form_state_data = seg[1];
-        } else if (seg[0] === Segment.is_binary) {
-            if (initial_server_data_buffer === null) {
-                throw new Error('UnboundLocalError: initial_server_data_buffer was not yet initialized');
-            }
-            const buffer = Buffer.from(seg[1], 'base64'); // base64 decode
-            initial_server_data_buffer.push(buffer.toString('utf-8'));
-        } else {
-            throw new Error(`Unknown segment type ${seg[0]}`);
-        }
+    if (segmentType === Segment.is_bootstrap) {
+      initialServerDataBuffer = [];
+    } else if (segmentType === Segment.is_not_bootstrap) {
+      if (initialServerDataBuffer === undefined) {
+        throw new Error(
+          'The `initialServerDataBuffer` was not yet initialized and a segment tried to append its data to it. ' +
+          'This should not be happening if the flight data starts correctly with a the `is_bootstrap` segment.'
+        );
+      }
+      initialServerDataBuffer.push(seg[1]);
+    } else if (segmentType === Segment.is_form_state) {
+      initialFormStateData = seg[1];
+    } else if (segmentType === Segment.is_binary) {
+      if (initialServerDataBuffer === undefined) {
+        throw new Error(
+          'The `initialServerDataBuffer` was not yet initialized and a segment tried to append its data to it. ' +
+          'This should not be happening if the flight data starts correctly with a the `is_bootstrap` segment.'
+        );
+      }
+      // Decode base64
+      const decodedChunk = atob(seg[1]);
+      initialServerDataBuffer.push(decodedChunk);
+    } else {
+      throw new Error(`Unknown segment type seg[0]=${segmentType}`);
     }
-    if (initial_server_data_buffer === null) {
-        // As per python logic, it raises Error if used before init.
-        // But if loop finishes without init, it just returns null in JS?
-        // Python returns `initial_server_data_buffer` which would be uninitialized if first loop didn't run or verify.
-        // But the first segment SHOULD be bootstrap.
-        // If raw_flight_data is empty, it returns unbound error in Python?
-        // Actually Python code: `return initial_server_data_buffer` -> if not assigned, UnboundLocalError.
-        throw new Error('UnboundLocalError: initial_server_data_buffer not initialized (empty data?)');
+  }
+  return initialServerDataBuffer;
+}
+/**
+ * Parse decoded raw flight data into structured objects
+ * @param {Array<string>} decodedRawFlightData - Decoded flight data chunks
+ * @returns {Object} Dictionary mapping indices to parsed elements
+ */
+export function parseDecodedRawFlightData(decodedRawFlightData) {
+  // Join and encode to bytes
+  const compiledRawFlightData = new TextEncoder().encode(decodedRawFlightData.join(''));
+  const indexedResult = {};
+  let pos = 0;
+  while (true) {
+    const indexStringEnd = compiledRawFlightData.indexOf(58, pos); // ':'
+    if (indexStringEnd === -1) {
+      break;
     }
-    return initial_server_data_buffer;
-};
+    const indexStringRaw = compiledRawFlightData.slice(pos, indexStringEnd);
+    let index = null;
+    if (indexStringRaw.length > 0) {
+      const indexStr = new TextDecoder().decode(indexStringRaw);
+      index = parseInt(indexStr, 16);
+    }
+    pos = indexStringEnd + 1;
-const _split_points = /(?<!\\)\n[a-f0-9]*:/g;
-// JS regex lookbehind support depends on engine. Bun supports it (V8/JSC).
-// But `\n` matching might be issue if buffer.
-// We are working with bytes in Python logic.
-// JS regex works on strings.
-// But we need to find split points in the *encoded* bytes?
-// Python: `_split_points = re.compile(rb"(?<!\\)\n[a-f0-9]*:")` (bytes regex)
+    // Extract value class (uppercase letters)
+    let valueClass = '';
+    while (pos < compiledRawFlightData.length) {
+      const char = String.fromCharCode(compiledRawFlightData[pos]);
+      if (/[A-Z]/.test(char)) {
+        valueClass += char;
+        pos++;
+      } else {
+        break;
+      }
+    }
+    valueClass = valueClass || null;
-// We need to implement `parse_decoded_raw_flight_data` using Buffers to match Python logic precisely.
+    let value;
-/**
- * Parses decoded raw flight data.
- * @param {string[]} decoded_raw_flight_data
- * @returns {object}
- */
-export const parse_decoded_raw_flight_data = (decoded_raw_flight_data) => {
-    const combinedString = decoded_raw_flight_data.join("");
-    const buffer = Buffer.from(combinedString); // UTF-8 encoded buffer
-    const indexed_result = {};
-    let pos = 0;
-    while (true) {
-        // Find index string end ":", starting from pos
-        const colonIndex = buffer.indexOf(58, pos); // 58 is ':'
-        if (colonIndex === -1) {
-            break;
-        }
-        const indexStringBuf = buffer.subarray(pos, colonIndex);
-        let index = null;
-        if (indexStringBuf.length > 0) {
-            const indexString = indexStringBuf.toString();
-            try {
-                 index = parseInt(indexString, 16);
-            } catch (e) {
-                // Ignore?
+    if (valueClass === 'T') {
+      const textLengthStringEnd = compiledRawFlightData.indexOf(44, pos); // ','
+      const textLengthHex = compiledRawFlightData.slice(pos, textLengthStringEnd);
+      const textLength = parseInt(new TextDecoder().decode(textLengthHex), 16);
+      const textStart = textLengthStringEnd + 1;
+      value = new TextDecoder().decode(compiledRawFlightData.slice(textStart, textStart + textLength));
+      pos = textStart + textLength;
+    } else {
+      // Find next split point
+      let dataEnd = -1;
+      for (let i = pos; i < compiledRawFlightData.length - 1; i++) {
+        if (compiledRawFlightData[i] === 10) { // '\n'
+          if (i === 0 || compiledRawFlightData[i - 1] !== 92) { // not escaped
+            let j = i + 1;
+            while (j < compiledRawFlightData.length && /[0-9a-f]/.test(String.fromCharCode(compiledRawFlightData[j]))) {
+              j++;
             }
-        }
-        pos = colonIndex + 1;
-        // Iterate while char is uppercase letter
-        let value_class = "";
-        while (pos < buffer.length) {
-            const byte = buffer[pos];
-            const char = String.fromCharCode(byte);
-            if (/[A-Z]/.test(char)) {
-                value_class += char;
-                pos++;
-            } else {
-                break;
-            }
-        }
-        if (value_class === "") value_class = null;
-        let raw_value_str;
-        let value;
-        if (value_class === "T") {
-            // Find comma
-            const commaIndex = buffer.indexOf(44, pos); // 44 is ','
-            if (commaIndex === -1) throw new Error("Expected comma after 'T' class size");
-            const lenHexBuf = buffer.subarray(pos, commaIndex);
-            const textLength = parseInt(lenHexBuf.toString(), 16);
-            const textStart = commaIndex + 1;
-            const textEnd = textStart + textLength;
-            const textBuf = buffer.subarray(textStart, textEnd);
-            raw_value_str = textBuf.toString('utf-8');
-            value = raw_value_str;
-            pos = textEnd;
-        } else {
-            // Search for next split point: `\n` followed by hex+colon
-            // We can search for `\n` and check pattern.
-            // Loop until found or end
-            let nextSplitPos = -1;
-            let searchPos = pos;
-            while (true) {
-                const newlineIndex = buffer.indexOf(10, searchPos); // 10 is '\n'
-                if (newlineIndex === -1) {
-                    break;
-                }
-                // Check lookbehind: `(?<!\\)` -> char before `\n` should not be `\` (92)
-                let isEscaped = false;
-                if (newlineIndex > 0 && buffer[newlineIndex - 1] === 92) {
-                    isEscaped = true;
-                }
-                if (!isEscaped) {
-                    // Check lookahead: `[a-f0-9]*:`
-                    // We scan from newlineIndex + 1 for hex chars then colon
-                    let p = newlineIndex + 1;
-                    let isMatch = true;
-                    while (p < buffer.length) {
-                        const b = buffer[p];
-                        if (b === 58) { // found colon
-                            break;
-                        }
-                        const c = String.fromCharCode(b);
-                        if (!/[a-f0-9]/.test(c)) {
-                            isMatch = false;
-                            break;
-                        }
-                        p++;
-                    }
-                    // If we stopped at colon, it's a match
-                    if (isMatch && p < buffer.length && buffer[p] === 58) {
-                        nextSplitPos = newlineIndex; // The split starts at `\n`
-                        break;
-                    }
-                }
-                searchPos = newlineIndex + 1;
-            }
-            if (nextSplitPos !== -1) {
-                const valBuf = buffer.subarray(pos, nextSplitPos);
-                raw_value_str = valBuf.toString('utf-8');
-                pos = nextSplitPos + 1; // Skip the newline
-            } else {
-                // Until end
-                // Python: `raw_value = compiled_raw_flight_data[pos:-1]`
-                // Wait, [pos:-1] removes the LAST byte?
-                // Why?
-                // Ah, because `compiled_raw_flight_data` in python might have a trailing `\n` or similar?
-                // Or maybe just generic slice logic?
-                // `raw_value = compiled_raw_flight_data[pos:data_end]` (excludes `\n`)
-                // If it goes to end, `pos:-1`.
-                // Let's assume it means "up to the last character".
-                // Python slice `[pos:-1]` includes from pos up to (but not including) the last item.
-                // Does flight data always end with a newline valid as split point?
-                // If the stream ends, it might not have the next split marker.
-                // But why exclude the last char?
-                // Maybe the stream ends with a newline?
-                // Let's check Python code again.
-                // `raw_value = compiled_raw_flight_data[pos:-1]`
-                // `pos += len(raw_value)`
-                // This implies it consumes everything EXCEPT the very last byte.
-                // Is there a phantom byte at the end? `compiled_raw_flight_data` is just `.join().encode()`.
-                // If I decode `raw_flight_data`, join them, encode them.
-                // Maybe I should match strict Python behavior.
-                // `combinedString` in JS vs Python.
-                // If I have "foo", [0:-1] is "fo".
-                // Let's assume for now I should take everything. The `-1` in Python is suspicious unless I know why.
-                // Maybe `_split_points` regex matching behavior in loop?
-                // If I am at the last chunk, it might not end with `\n...:`.
-                // If I simply take everything `buffer.subarray(pos)`, I might include a trailing newline that effectively belongs to the "next" but nonexistent chunk?
-                // But `pos` is advanced.
-                // WAIT. If `_split_points` finds a match, `data_end` is the start of `\n`.
-                // `raw_value = ...[pos:data_end]`.
-                // If `else` (no match), `raw_value = [pos:-1]`.
-                // This definitely drops the last byte.
-                // I will replicate this behavior: `buffer.subarray(pos, buffer.length - 1)`.
-                const valBuf = buffer.subarray(pos, buffer.length - 1);
-                raw_value_str = valBuf.toString('utf-8');
-                pos += valBuf.length;
-                // And loop will terminate because pos vs buffer.length check or colonIndex search
-            }
-            try {
-                value = JSON.parse(raw_value_str);
-            } catch (e) {
-                // If JSON parse fails, keep string? Python code: `value = orjson.loads(raw_value)`
-                // It assumes valid JSON.
-                value = raw_value_str;
+            if (j < compiledRawFlightData.length && compiledRawFlightData[j] === 58) {
+              dataEnd = i;
+              break;
             }
+          }
         }
-        const resolved = resolve_type({
-            value: value,
-            value_class: value_class,
-            index: index
-        });
-        if (index === null) {
-            // Wait, why index as key if it is null?
-            // Python: `if index not in indexed_result: ...` -> index is None.
-            // `indexed_result[None] = []`
-            // JS objects keys are strings. "null".
-            // If index is null, key is "null".
-            if (!indexed_result["null"]) {
-                indexed_result["null"] = [];
-            }
-            indexed_result["null"].push(resolved);
-        } else {
-            indexed_result[index] = resolved;
-        }
+      }
+      const rawValue = dataEnd !== -1
+        ? compiledRawFlightData.slice(pos, dataEnd)
+        : compiledRawFlightData.slice(pos);
+      pos = dataEnd !== -1 ? dataEnd + 1 : compiledRawFlightData.length;
+      const rawText = new TextDecoder().decode(rawValue);
+      if (rawText.length === 0) {
+        value = null;
+      } else {
+        value = JSON.parse(rawText);
+      }
     }
-    return indexed_result;
-};
+    const resolved = resolveType(value, valueClass, index);
+    if (index === null) {
+      if (!(index in indexedResult)) {
+        indexedResult[index] = [];
+      }
+      indexedResult[index].push(resolved);
+    } else {
+      indexedResult[index] = resolved;
+    }
+  }
+  return indexedResult;
+}
 /**
- * Returns the flight data of the page.
- * @param {any} value
- * @returns {object | null}
+ * Get parsed flight data from HTML
+ * @param {string} html - HTML string
+ * @param {DOMParser} DOMParser - DOMParser instance
+ * @returns {Object|null} Parsed flight data or null
  */
-export const get_flight_data = (value) => {
-    const raw = get_raw_flight_data(value);
-    if (raw) {
-        const decoded = decode_raw_flight_data(raw);
-        return parse_decoded_raw_flight_data(decoded);
-    }
+export function getFlightData(html, DOMParser) {
+  const rawFlightData = getRawFlightData(html, DOMParser);
+  if (rawFlightData === null) {
     return null;
-};
+  }
+  const decodedRawFlightData = decodeRawFlightData(rawFlightData);
+  return parseDecodedRawFlightData(decodedRawFlightData);
+}

package/parser/manifests.js CHANGED

@@ -1,46 +1,46 @@
+/**
+ * Build manifest parsing
+ */
 import { join } from '../utils.js';
-import { _NS } from './urls.js';
-export const _build_manifest_name = "_buildManifest.js";
-export const _ssg_manifest_name = "_ssgManifest.js";
-export const _build_manifest_path = `/${_build_manifest_name}`;
-export const _ssg_manifest_path = `/${_ssg_manifest_name}`;
+const _NS = '/_next/static/';
+const _build_manifest_name = '_buildManifest.js';
+const _ssg_manifest_name = '_ssgManifest.js';
+const _build_manifest_path = `/${_build_manifest_name}`;
+const _ssg_manifest_path = `/${_ssg_manifest_name}`;
 export const _manifest_paths = [_build_manifest_path, _ssg_manifest_path];
 /**
- * Parses the buildmanifest script (`"/_buildManifest.js"`).
- * @param {string} script
- * @returns {object | null}
+ * Parse build manifest script
+ * @param {string} script - Build manifest script content
+ * @returns {Object} Parsed manifest object
  */
-export const parse_buildmanifest = (script) => {
-    const s = script.trim();
-    if (!s.startsWith("self.__BUILD_MANIFEST")) {
-        throw new Error('Invalid build manifest (not starting by `"self.__BUILD_MANIFEST"`).');
-    }
-    // We can use a simple evaluation mechanism or regex, but `eval` in JS is dangerous.
-    // However, since we are porting Python's pythonmonkey.eval (which is spidermonkey),
-    // it seems the intention is to execute the code.
-    // In JS context (Bun/Node), we can use `vm` or `new Function`.
-    // The script looks like: self.__BUILD_MANIFEST={...};self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB()
-    // Let's try to emulate the browser environment slightly.
-    const mockSelf = {};
-    const func = new Function('self', s + '; return self.__BUILD_MANIFEST;');
-    try {
-        return func(mockSelf);
-    } catch (e) {
-        console.warn(`Could not parse the given build manifest \`${s}\``);
-        return null;
-    }
-};
+export function parseBuildManifest(script) {
+  const s = script.trim();
+  if (!s.startsWith('self.__BUILD_MANIFEST')) {
+    throw new Error('Invalid build manifest (not starting by `"self.__BUILD_MANIFEST"`).');
+  }
+  // Wrap in IIFE and evaluate
+  const func = `(function() {self={};${s.replace(/;$/, '')};return self.__BUILD_MANIFEST})();`;
+  try {
+    return eval(func);
+  } catch (e) {
+    console.warn(`Could not parse the given build manifest \`${s}\``);
+    throw e;
+  }
+}
 /**
- * Gives the path of the build manifest based on the given build id and base path.
- * @param {string} build_id
- * @param {string} [base_path]
- * @returns {string}
+ * Get build manifest path
+ * @param {string} buildId - Build ID
+ * @param {string} basePath - Base path (optional)
+ * @returns {string} Build manifest path
  */
-export const get_build_manifest_path = (build_id, base_path = "") => {
-    return join(base_path, _NS, build_id, _build_manifest_name);
-};
+export function getBuildManifestPath(buildId, basePath = '') {
+  return join(basePath, _NS, buildId, _build_manifest_name);
+}