@opencommerceprotocol/cli 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/cli.js +216 -2
  2. package/package.json +6 -7
package/dist/cli.js CHANGED
@@ -895,6 +895,184 @@ async function fetchSitemap(url, client, maxDepth = 3) {
895
895
  return result.urls;
896
896
  }
897
897
 
898
+ // src/utils/product-discovery.ts
899
+ var SAMPLE_BLOCKLIST = [
900
+ /\/cart\b/i,
901
+ /\/checkout\b/i,
902
+ /\/search\b/i,
903
+ /\/account\b/i,
904
+ /\/login\b/i,
905
+ /\/register\b/i,
906
+ /\/contact\b/i,
907
+ /\/about\b/i,
908
+ /\/blog\b/i,
909
+ /\/news\b/i,
910
+ /\/page\//i,
911
+ /\/pages\//i,
912
+ /\/collections?\//i,
913
+ /\/category\/?$/i,
914
+ /\/categories\/?$/i,
915
+ /\/sitemap/i,
916
+ /\/robots\.txt$/i,
917
+ /\/feed\b/i,
918
+ /\/rss\b/i
919
+ ];
920
+ function selectSampleUrls(urls, sampleSize) {
921
+ if (sampleSize <= 0 || urls.length === 0) return [];
922
+ const candidates = urls.filter((u) => {
923
+ try {
924
+ const pathname = new URL(u.loc).pathname;
925
+ return !SAMPLE_BLOCKLIST.some((re) => re.test(pathname));
926
+ } catch {
927
+ return false;
928
+ }
929
+ });
930
+ if (candidates.length <= sampleSize) return candidates;
931
+ const selected = [];
932
+ for (let i = 0; i < sampleSize; i++) {
933
+ const idx = Math.floor(i * candidates.length / sampleSize);
934
+ selected.push(candidates[idx]);
935
+ }
936
+ return selected;
937
+ }
938
+ function pathSegments(pathname) {
939
+ return pathname.split("/").filter((s) => s.length > 0);
940
+ }
941
+ function pathExtension(pathname) {
942
+ const last = pathname.split("/").pop() ?? "";
943
+ const dotIdx = last.lastIndexOf(".");
944
+ if (dotIdx <= 0) return void 0;
945
+ const ext = last.slice(dotIdx).toLowerCase();
946
+ if (!/^\.[a-z][a-z0-9]{0,5}$/.test(ext)) return void 0;
947
+ return ext;
948
+ }
949
+ function escapeRegex(s) {
950
+ return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
951
+ }
952
+ function validateCandidate(pattern, positives, negatives) {
953
+ if (positives.length === 0) return { ok: false, confidence: 0 };
954
+ const posHits = positives.filter((p) => pattern.test(p)).length;
955
+ if (posHits !== positives.length) return { ok: false, confidence: 0 };
956
+ const negHits = negatives.filter((n) => pattern.test(n)).length;
957
+ const negLeak = negatives.length > 0 ? negHits / negatives.length : 0;
958
+ if (negLeak > 0.1) return { ok: false, confidence: 0 };
959
+ return { ok: true, confidence: 1 - negLeak };
960
+ }
961
+ function tryStrategySharedSegment(positives, negatives) {
962
+ const segments = positives.map(pathSegments);
963
+ const minDepth = Math.min(...segments.map((s) => s.length));
964
+ if (minDepth === 0) return null;
965
+ for (let d = 0; d < minDepth; d++) {
966
+ const candidate = segments[0][d];
967
+ if (!candidate) continue;
968
+ const allMatch = segments.every((s) => s[d] === candidate);
969
+ if (!allMatch) continue;
970
+ const pattern = new RegExp(`\\/${escapeRegex(candidate)}\\/`, "i");
971
+ const validation = validateCandidate(pattern, positives, negatives);
972
+ if (validation.ok) {
973
+ return {
974
+ pattern,
975
+ strategy: "shared-segment",
976
+ positives,
977
+ confidence: validation.confidence
978
+ };
979
+ }
980
+ }
981
+ return null;
982
+ }
983
+ function tryStrategyExtensionSuffix(positives, negatives) {
984
+ const exts = positives.map(pathExtension);
985
+ const firstExt = exts[0];
986
+ if (!firstExt || !exts.every((e) => e === firstExt)) return null;
987
+ const pattern = new RegExp(`${escapeRegex(firstExt)}$`, "i");
988
+ const validation = validateCandidate(pattern, positives, negatives);
989
+ if (!validation.ok) return null;
990
+ return {
991
+ pattern,
992
+ strategy: "extension-suffix",
993
+ positives,
994
+ confidence: validation.confidence
995
+ };
996
+ }
997
+ function tryStrategyExtensionDepth(positives, negatives) {
998
+ const exts = positives.map(pathExtension);
999
+ const firstExt = exts[0];
1000
+ if (!firstExt || !exts.every((e) => e === firstExt)) return null;
1001
+ const depths = positives.map((p) => pathSegments(p).length);
1002
+ const minDepth = Math.min(...depths);
1003
+ if (minDepth < 2) return null;
1004
+ const pattern = new RegExp(`^(?:\\/[^\\/]+){${minDepth},}${escapeRegex(firstExt)}$`, "i");
1005
+ const validation = validateCandidate(pattern, positives, negatives);
1006
+ if (!validation.ok) return null;
1007
+ return {
1008
+ pattern,
1009
+ strategy: "extension-depth",
1010
+ positives,
1011
+ confidence: validation.confidence
1012
+ };
1013
+ }
1014
+ function tryStrategyExtensionSegment(positives, negatives) {
1015
+ const segments = positives.map(pathSegments);
1016
+ if (segments.some((s) => s.length === 0)) return null;
1017
+ const firstSeg = segments[0][0];
1018
+ if (!segments.every((s) => s[0] === firstSeg)) return null;
1019
+ const exts = positives.map(pathExtension);
1020
+ const firstExt = exts[0];
1021
+ if (!firstExt || !exts.every((e) => e === firstExt)) return null;
1022
+ const pattern = new RegExp(
1023
+ `^\\/${escapeRegex(firstSeg)}\\/.*${escapeRegex(firstExt)}$`,
1024
+ "i"
1025
+ );
1026
+ const validation = validateCandidate(pattern, positives, negatives);
1027
+ if (!validation.ok) return null;
1028
+ return {
1029
+ pattern,
1030
+ strategy: "extension-segment",
1031
+ positives,
1032
+ confidence: validation.confidence
1033
+ };
1034
+ }
1035
+ function inferProductPattern(positives, negatives) {
1036
+ if (positives.length === 0) return null;
1037
+ return tryStrategySharedSegment(positives, negatives) ?? tryStrategyExtensionDepth(positives, negatives) ?? tryStrategyExtensionSegment(positives, negatives) ?? tryStrategyExtensionSuffix(positives, negatives) ?? null;
1038
+ }
1039
+ async function discoverProductPattern(urls, fetchAndExtract, options = {}) {
1040
+ const sampleSize = options.sampleSize ?? 25;
1041
+ const minPositives = options.minPositives ?? 3;
1042
+ const concurrency = Math.max(1, options.concurrency ?? 5);
1043
+ const onProgress = options.onProgress;
1044
+ const samples = selectSampleUrls(urls, sampleSize);
1045
+ if (samples.length === 0) return null;
1046
+ const positives = [];
1047
+ const negatives = [];
1048
+ let done = 0;
1049
+ for (let i = 0; i < samples.length; i += concurrency) {
1050
+ const batch = samples.slice(i, i + concurrency);
1051
+ const results = await Promise.allSettled(
1052
+ batch.map(async (u) => ({ url: u.loc, isProduct: await fetchAndExtract(u.loc) }))
1053
+ );
1054
+ for (const result of results) {
1055
+ done++;
1056
+ if (result.status !== "fulfilled") continue;
1057
+ const { url, isProduct } = result.value;
1058
+ let pathname;
1059
+ try {
1060
+ pathname = new URL(url).pathname;
1061
+ } catch {
1062
+ continue;
1063
+ }
1064
+ if (isProduct) {
1065
+ positives.push(pathname);
1066
+ } else {
1067
+ negatives.push(pathname);
1068
+ }
1069
+ }
1070
+ if (onProgress) onProgress(done, samples.length, positives.length);
1071
+ }
1072
+ if (positives.length < minPositives) return null;
1073
+ return inferProductPattern(positives, negatives);
1074
+ }
1075
+
898
1076
  // src/utils/jsonld-extractor.ts
899
1077
  function extractJsonLdBlocks(html) {
900
1078
  const blocks = [];
@@ -1524,8 +1702,9 @@ async function runCrawl(url, options) {
1524
1702
  sitemapSpinner.succeed(`Found ${allSitemapUrls.length} URLs in sitemap(s)`);
1525
1703
  const productPattern = options.productPattern ? new RegExp(options.productPattern, "i") : void 0;
1526
1704
  let productUrls = filterProductUrls(allSitemapUrls, productPattern);
1527
- if (robots) {
1528
- productUrls = productUrls.filter((u) => {
1705
+ const applyRobotsFilter = (candidates) => {
1706
+ if (!robots) return candidates;
1707
+ return candidates.filter((u) => {
1529
1708
  try {
1530
1709
  const urlPath = new URL(u.loc).pathname;
1531
1710
  return isPathAllowed(robots, urlPath, "OCPBot");
@@ -1533,6 +1712,41 @@ async function runCrawl(url, options) {
1533
1712
  return true;
1534
1713
  }
1535
1714
  });
1715
+ };
1716
+ productUrls = applyRobotsFilter(productUrls);
1717
+ const SAMPLING_MIN_MATCHES = 20;
1718
+ if (productUrls.length < SAMPLING_MIN_MATCHES && !options.productPattern && allSitemapUrls.length > 0) {
1719
+ const sampleSpinner = ora(
1720
+ `Only ${productUrls.length} URLs matched default product patterns \u2014 sampling to infer site layout...`
1721
+ ).start();
1722
+ const inference = await discoverProductPattern(
1723
+ allSitemapUrls,
1724
+ async (pageUrl) => {
1725
+ const resp = await client.get(pageUrl);
1726
+ if (!resp.ok) throw new Error(`fetch failed: ${resp.status}`);
1727
+ return extractProductsFromHtml(resp.text, pageUrl).length > 0;
1728
+ },
1729
+ {
1730
+ sampleSize: 25,
1731
+ minPositives: 5,
1732
+ concurrency: options.concurrency,
1733
+ onProgress: (done, total, found) => {
1734
+ sampleSpinner.text = `Sampling pages to find products... ${done}/${total} (${found} products found)`;
1735
+ }
1736
+ }
1737
+ );
1738
+ if (inference) {
1739
+ sampleSpinner.succeed(
1740
+ `Inferred product pattern: ${inference.pattern} (strategy: ${inference.strategy}, ${inference.positives.length} positives)`
1741
+ );
1742
+ productUrls = applyRobotsFilter(filterProductUrls(allSitemapUrls, inference.pattern));
1743
+ } else {
1744
+ sampleSpinner.fail("Could not infer a confident product URL pattern.");
1745
+ console.log(import_chalk5.default.yellow("\n Re-run with --product-pattern '<regex>' to specify the pattern manually."));
1746
+ console.log(import_chalk5.default.dim(" Example: --product-pattern '\\.html$'"));
1747
+ console.log(import_chalk5.default.dim(" If your product pages require JavaScript to render, the crawler cannot extract their data."));
1748
+ return;
1749
+ }
1536
1750
  }
1537
1751
  if (options.maxProducts) {
1538
1752
  productUrls = productUrls.slice(0, options.maxProducts);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@opencommerceprotocol/cli",
3
- "version": "1.0.0",
3
+ "version": "1.1.0",
4
4
  "description": "Open Commerce Protocol — CLI tool for generating and validating OCP files",
5
5
  "bin": {
6
6
  "ocp": "./dist/cli.js"
@@ -18,7 +18,7 @@
18
18
  ],
19
19
  "scripts": {
20
20
  "build": "tsup",
21
- "test": "vitest run",
21
+ "test": "vitest run --root ../.. packages/cli",
22
22
  "lint": "eslint src",
23
23
  "typecheck": "tsc --noEmit",
24
24
  "prepublishOnly": "npm run build"
@@ -36,17 +36,16 @@
36
36
  "@types/papaparse": "^5.3.14",
37
37
  "tsup": "^8.0.0",
38
38
  "typescript": "^5.5.0",
39
- "vitest": "^2.0.0",
40
- "@types/node": "^20.0.0",
41
- "jsdom": "^28.1.0"
39
+ "vitest": "^2.0.0"
42
40
  },
43
41
  "repository": {
44
42
  "type": "git",
45
- "url": "https://github.com/OpenCommerceProtocol/cli.git"
43
+ "url": "https://github.com/opencommerceprotocol/ocp",
44
+ "directory": "packages/cli"
46
45
  },
47
46
  "homepage": "https://opencommerceprotocol.org",
48
47
  "bugs": {
49
- "url": "https://github.com/OpenCommerceProtocol/cli/issues"
48
+ "url": "https://github.com/opencommerceprotocol/ocp/issues"
50
49
  },
51
50
  "keywords": [
52
51
  "ocp",