webpeel 0.1.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +118 -34
- package/dist/cli.js +339 -13
- package/dist/cli.js.map +1 -1
- package/dist/core/crawler.d.ts +58 -0
- package/dist/core/crawler.d.ts.map +1 -0
- package/dist/core/crawler.js +205 -0
- package/dist/core/crawler.js.map +1 -0
- package/dist/core/fetcher.d.ts +8 -1
- package/dist/core/fetcher.d.ts.map +1 -1
- package/dist/core/fetcher.js +111 -24
- package/dist/core/fetcher.js.map +1 -1
- package/dist/core/markdown.d.ts +5 -0
- package/dist/core/markdown.d.ts.map +1 -1
- package/dist/core/markdown.js +50 -22
- package/dist/core/markdown.js.map +1 -1
- package/dist/core/strategies.d.ts +14 -3
- package/dist/core/strategies.d.ts.map +1 -1
- package/dist/core/strategies.js +44 -8
- package/dist/core/strategies.js.map +1 -1
- package/dist/index.d.ts +22 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +70 -7
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +308 -8
- package/dist/mcp/server.js.map +1 -1
- package/dist/types.d.ts +18 -2
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/llms.txt +1 -1
- package/package.json +7 -3
package/dist/cli.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AAEA;;;;;;;;;;;;GAYG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,GAAG,MAAM,KAAK,CAAC;AACtB,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AAG3C,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,SAAS,CAAC;KACf,WAAW,CAAC,gCAAgC,CAAC;KAC7C,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB,OAAO;KACJ,QAAQ,CAAC,OAAO,EAAE,cAAc,CAAC;KACjC,MAAM,CAAC,cAAc,EAAE,2CAA2C,CAAC;KACnE,MAAM,CAAC,iBAAiB,EAAE,gCAAgC,EAAE,QAAQ,CAAC;KACrE,MAAM,CAAC,QAAQ,EAAE,qCAAqC,CAAC;KACvD,MAAM,CAAC,QAAQ,EAAE,uCAAuC,CAAC;KACzD,MAAM,CAAC,QAAQ,EAAE,gBAAgB,CAAC;KAClC,MAAM,CAAC,oBAAoB,EAAE,sBAAsB,EAAE,QAAQ,EAAE,KAAK,CAAC;KACrE,MAAM,CAAC,cAAc,EAAE,mBAAmB,CAAC;KAC3C,MAAM,CAAC,cAAc,EAAE,0BAA0B,CAAC;KAClD,MAAM,CAAC,KAAK,EAAE,GAAuB,EAAE,OAAO,EAAE,EAAE;IACjD,IAAI,CAAC,GAAG,EAAE,CAAC;QACT,OAAO,CAAC,KAAK,CAAC,0BAA0B,CAAC,CAAC;QAC1C,OAAO,CAAC,IAAI,EAAE,CAAC;QACf,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,oCAAoC;IACpC,IAAI,GAAG,CAAC,MAAM,GAAG,IAAI,EAAE,CAAC;QACtB,OAAO,CAAC,KAAK,CAAC,2CAA2C,CAAC,CAAC;QAC3D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,+BAA+B;IAC/B,IAAI,iBAAiB,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;QAChC,OAAO,CAAC,KAAK,CAAC,gDAAgD,CAAC,CAAC;QAChE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,sBAAsB;IACtB,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC5B,IAAI,CAAC,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,QAAQ,CAAC,EAAE,CAAC;YACnD,OAAO,CAAC,KAAK,CAAC,kDAAkD,CAAC,CAAC;YAClE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,CAAC,KAAK,CAAC,8BAA8B,GAAG,EAAE,CAAC,CAAC;QACnD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,MAAM,OAAO,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC,KAAK,EAAE,CAAC;IAEnE,IAAI,CAAC;QACH,mBAAmB;QACnB,IAAI,OAAO,CAAC,IAAI,IAAI,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC,IAAI,OAAO,CAAC,IAAI,GAAG,KAAK,CAAC,EAAE,CAAC;YAC/D,OAAO,CAAC,KAAK,CAAC,gDAAgD,CAAC,CAAC;YAChE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB
,CAAC;QAED,qBAAqB;QACrB,MAAM,WAAW,GAAgB;YAC/B,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,KAAK;YAC/B,IAAI,EAAE,OAAO,CAAC,IAAI,IAAI,CAAC;YACvB,OAAO,EAAE,OAAO,CAAC,OAAO;YACxB,SAAS,EAAE,OAAO,CAAC,EAAE;SACtB,CAAC;QAEF,mBAAmB;QACnB,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;YACjB,WAAW,CAAC,MAAM,GAAG,MAAM,CAAC;QAC9B,CAAC;aAAM,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;YACxB,WAAW,CAAC,MAAM,GAAG,MAAM,CAAC;QAC9B,CAAC;aAAM,CAAC;YACN,WAAW,CAAC,MAAM,GAAG,UAAU,CAAC;QAClC,CAAC;QAED,iBAAiB;QACjB,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,GAAG,EAAE,WAAW,CAAC,CAAC;QAE5C,IAAI,OAAO,EAAE,CAAC;YACZ,OAAO,CAAC,OAAO,CAAC,cAAc,MAAM,CAAC,OAAO,YAAY,MAAM,CAAC,MAAM,SAAS,CAAC,CAAC;QAClF,CAAC;QAED,iBAAiB;QACjB,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;YACjB,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;QAC/C,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC9B,CAAC;QAED,oBAAoB;QACpB,MAAM,OAAO,EAAE,CAAC;QAChB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,OAAO,EAAE,CAAC;YACZ,OAAO,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;QAClC,CAAC;QAED,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,OAAO,CAAC,KAAK,CAAC,YAAY,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;QAC7C,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,KAAK,CAAC,iCAAiC,CAAC,CAAC;QACnD,CAAC;QAED,MAAM,OAAO,EAAE,CAAC;QAChB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,kBAAkB;AAClB,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,QAAQ,CAAC,SAAS,EAAE,cAAc,CAAC;KACnC,WAAW,CAAC,kCAAkC,CAAC;KAC/C,MAAM,CAAC,GAAG,EAAE;IACX,OAAO,CAAC,GAAG,CAAC,oCAAoC,CAAC,CAAC;IAClD,OAAO,CAAC,GAAG,CAAC,4CAA4C,CAAC,CAAC;IAC1D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,OAAO,CAAC;KAChB,WAAW,CAAC,kBAAkB,CAAC;KAC/B,MAAM,CAAC,mBAAmB,EAAE,aAAa,EAAE,MAAM,CAAC;KAClD,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,MAAM,CAAC,iBAAiB,CAAC,CAAC;IACxD,WAAW,CAAC,EAAE,IAAI,EAAE,QAAQ,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC;AACpD,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,KAAK,CAAC;KACd,WAAW,CAAC,8CAA8C,CAAC;KAC3D,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,
MAAM,MAAM,CAAC,iBAAiB,CAAC,CAAC;AAClC,CAAC,CAAC,CAAC;AAEL,OAAO,CAAC,KAAK,EAAE,CAAC"}
|
|
1
|
+
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AAEA;;;;;;;;;;;;GAYG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,GAAG,MAAM,KAAK,CAAC;AACtB,OAAO,EAAE,aAAa,EAAE,MAAM,IAAI,CAAC;AACnC,OAAO,EAAE,IAAI,EAAE,SAAS,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AAGtD,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,SAAS,CAAC;KACf,WAAW,CAAC,gCAAgC,CAAC;KAC7C,OAAO,CAAC,OAAO,CAAC;KAChB,uBAAuB,EAAE,CAAC;AAE7B,OAAO;KACJ,QAAQ,CAAC,OAAO,EAAE,cAAc,CAAC;KACjC,MAAM,CAAC,cAAc,EAAE,2CAA2C,CAAC;KACnE,MAAM,CAAC,WAAW,EAAE,kEAAkE,CAAC;KACvF,MAAM,CAAC,iBAAiB,EAAE,gCAAgC,EAAE,QAAQ,CAAC;KACrE,MAAM,CAAC,QAAQ,EAAE,qCAAqC,CAAC;KACvD,MAAM,CAAC,QAAQ,EAAE,uCAAuC,CAAC;KACzD,MAAM,CAAC,QAAQ,EAAE,gBAAgB,CAAC;KAClC,MAAM,CAAC,oBAAoB,EAAE,sBAAsB,EAAE,QAAQ,EAAE,KAAK,CAAC;KACrE,MAAM,CAAC,cAAc,EAAE,mBAAmB,CAAC;KAC3C,MAAM,CAAC,cAAc,EAAE,0BAA0B,CAAC;KAClD,MAAM,CAAC,qBAAqB,EAAE,kDAAkD,CAAC;KACjF,MAAM,CAAC,aAAa,EAAE,8CAA8C,CAAC;KACrE,MAAM,CAAC,kBAAkB,EAAE,uDAAuD,CAAC;KACnF,MAAM,CAAC,0BAA0B,EAAE,oDAAoD,CAAC;KACxF,MAAM,CAAC,0BAA0B,EAAE,sDAAsD,CAAC;KAC1F,MAAM,CAAC,sBAAsB,EAAE,yCAAyC,CAAC;KACzE,MAAM,CAAC,KAAK,EAAE,GAAuB,EAAE,OAAO,EAAE,EAAE;IACjD,IAAI,CAAC,GAAG,EAAE,CAAC;QACT,OAAO,CAAC,KAAK,CAAC,0BAA0B,CAAC,CAAC;QAC1C,OAAO,CAAC,IAAI,EAAE,CAAC;QACf,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,oCAAoC;IACpC,IAAI,GAAG,CAAC,MAAM,GAAG,IAAI,EAAE,CAAC;QACtB,OAAO,CAAC,KAAK,CAAC,2CAA2C,CAAC,CAAC;QAC3D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,+BAA+B;IAC/B,IAAI,iBAAiB,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;QAChC,OAAO,CAAC,KAAK,CAAC,gDAAgD,CAAC,CAAC;QAChE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,sBAAsB;IACtB,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC5B,IAAI,CAAC,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,QAAQ,CAAC,EAAE,CAAC;YACnD,OAAO,CAAC,KAAK,CAAC,kDAAkD,CAAC,CAAC;YAClE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,CAAC,KAAK,CAAC,8BAA8B,GAAG,EAAE,CAAC,CAAC;QACnD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC
;IAClB,CAAC;IAED,MAAM,OAAO,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC,KAAK,EAAE,CAAC;IAEnE,IAAI,CAAC;QACH,mBAAmB;QACnB,IAAI,OAAO,CAAC,IAAI,IAAI,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC,IAAI,OAAO,CAAC,IAAI,GAAG,KAAK,CAAC,EAAE,CAAC;YAC/D,OAAO,CAAC,KAAK,CAAC,gDAAgD,CAAC,CAAC;YAChE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QAED,uBAAuB;QACvB,IAAI,OAA2C,CAAC;QAChD,IAAI,OAAO,CAAC,MAAM,IAAI,OAAO,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAChD,OAAO,GAAG,EAAE,CAAC;YACb,KAAK,MAAM,MAAM,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;gBACpC,MAAM,UAAU,GAAG,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;gBACvC,IAAI,UAAU,KAAK,CAAC,CAAC,EAAE,CAAC;oBACtB,OAAO,CAAC,KAAK,CAAC,iCAAiC,MAAM,EAAE,CAAC,CAAC;oBACzD,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,CAAC;oBAC/C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;gBAClB,CAAC;gBACD,MAAM,GAAG,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC,IAAI,EAAE,CAAC;gBAC/C,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBAClD,OAAO,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC;YACvB,CAAC;QACH,CAAC;QAED,qBAAqB;QACrB,MAAM,WAAW,GAAgB;YAC/B,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,KAAK;YAC/B,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,KAAK;YACjC,IAAI,EAAE,OAAO,CAAC,IAAI,IAAI,CAAC;YACvB,OAAO,EAAE,OAAO,CAAC,OAAO;YACxB,SAAS,EAAE,OAAO,CAAC,EAAE;YACrB,UAAU,EAAE,OAAO,CAAC,UAAU,KAAK,SAAS;YAC5C,kBAAkB,EAAE,OAAO,CAAC,QAAQ,IAAI,KAAK;YAC7C,QAAQ,EAAE,OAAO,CAAC,QAAQ;YAC1B,OAAO,EAAE,OAAO,CAAC,OAAO;YACxB,OAAO;YACP,OAAO,EAAE,OAAO,CAAC,MAAM;SACxB,CAAC;QAEF,mBAAmB;QACnB,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;YACjB,WAAW,CAAC,MAAM,GAAG,MAAM,CAAC;QAC9B,CAAC;aAAM,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;YACxB,WAAW,CAAC,MAAM,GAAG,MAAM,CAAC;QAC9B,CAAC;aAAM,CAAC;YACN,WAAW,CAAC,MAAM,GAAG,UAAU,CAAC;QAClC,CAAC;QAED,iBAAiB;QACjB,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,GAAG,EAAE,WAAW,CAAC,CAAC;QAE5C,IAAI,OAAO,EAAE,CAAC;YACZ,OAAO,CAAC,OAAO,CAAC,cAAc,MAAM,CAAC,OAAO,YAAY,MAAM,CAAC,MAAM,SAAS,CAAC,CAAC;QAClF,CAAC;QAED,2BAA2B;QAC3B,IAAI,OAAO,CAAC,UAAU,IAAI,MAAM,CAAC,UAAU,EAAE,CAAC;YAC5C,MAAM,cAAc,GAAG,OAAO,OAAO,CAAC,UAAU,KAAK,QAAQ;gBAC3D,CAAC,CAAC,OAAO,CAAC,
UAAU;gBACpB,CAAC,CAAC,gBAAgB,CAAC;YAErB,MAAM,gBAAgB,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAC;YAClE,aAAa,CAAC,cAAc,EAAE,gBAAgB,CAAC,CAAC;YAEhD,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC;gBACpB,OAAO,CAAC,KAAK,CAAC,wBAAwB,cAAc,EAAE,CAAC,CAAC;YAC1D,CAAC;YAED,uDAAuD;YACvD,IAAI,OAAO,OAAO,CAAC,UAAU,KAAK,QAAQ,EAAE,CAAC;gBAC3C,OAAO,MAAM,CAAC,UAAU,CAAC;YAC3B,CAAC;QACH,CAAC;QAED,6CAA6C;QAC7C,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;YACjB,MAAM,OAAO,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;YAChD,MAAM,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;gBAC1C,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,OAAO,GAAG,IAAI,EAAE,CAAC,GAAG,EAAE,EAAE;oBAC3C,IAAI,GAAG;wBAAE,MAAM,CAAC,GAAG,CAAC,CAAC;;wBAChB,OAAO,EAAE,CAAC;gBACjB,CAAC,CAAC,CAAC;YACL,CAAC,CAAC,CAAC;QACL,CAAC;aAAM,CAAC;YACN,MAAM,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;gBAC1C,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,OAAO,GAAG,IAAI,EAAE,CAAC,GAAG,EAAE,EAAE;oBAClD,IAAI,GAAG;wBAAE,MAAM,CAAC,GAAG,CAAC,CAAC;;wBAChB,OAAO,EAAE,CAAC;gBACjB,CAAC,CAAC,CAAC;YACL,CAAC,CAAC,CAAC;QACL,CAAC;QAED,oBAAoB;QACpB,MAAM,OAAO,EAAE,CAAC;QAChB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,OAAO,EAAE,CAAC;YACZ,OAAO,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;QAClC,CAAC;QAED,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,OAAO,CAAC,KAAK,CAAC,YAAY,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;QAC7C,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,KAAK,CAAC,iCAAiC,CAAC,CAAC;QACnD,CAAC;QAED,MAAM,OAAO,EAAE,CAAC;QAChB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,iBAAiB;AACjB,OAAO;KACJ,OAAO,CAAC,gBAAgB,CAAC;KACzB,WAAW,CAAC,yBAAyB,CAAC;KACtC,MAAM,CAAC,iBAAiB,EAAE,0BAA0B,EAAE,GAAG,CAAC;KAC1D,MAAM,CAAC,QAAQ,EAAE,gBAAgB,CAAC;KAClC,MAAM,CAAC,cAAc,EAAE,aAAa,CAAC;KACrC,MAAM,CAAC,KAAK,EAAE,KAAa,EAAE,OAAO,EAAE,EAAE;IACvC,MAAM,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAC5B,MAAM,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC;IAChC,MAAM,KAAK,GAAG,QAAQ,CAAC,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAE3C,MAAM,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC,KAAK,EAAE,CAAC;IAE9D,
IAAI,CAAC;QACH,yCAAyC;QACzC,MAAM,EAAE,KAAK,EAAE,WAAW,EAAE,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,CAAC;QACtD,MAAM,EAAE,IAAI,EAAE,GAAG,MAAM,MAAM,CAAC,SAAS,CAAC,CAAC;QAEzC,MAAM,SAAS,GAAG,uCAAuC,kBAAkB,CAAC,KAAK,CAAC,EAAE,CAAC;QAErF,MAAM,QAAQ,GAAG,MAAM,WAAW,CAAC,SAAS,EAAE;YAC5C,OAAO,EAAE;gBACP,YAAY,EAAE,oEAAoE;aACnF;SACF,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,MAAM,IAAI,KAAK,CAAC,uBAAuB,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC;QAC5D,CAAC;QAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;QACnC,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;QAErB,MAAM,OAAO,GAA2D,EAAE,CAAC;QAE3E,CAAC,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,IAAI,EAAE,EAAE;YAC7B,IAAI,OAAO,CAAC,MAAM,IAAI,KAAK;gBAAE,OAAO;YAEpC,MAAM,OAAO,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC;YACxB,MAAM,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YAC3D,MAAM,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;YAC7D,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,CAAC,kBAAkB,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YAE/D,IAAI,CAAC,KAAK,IAAI,CAAC,MAAM;gBAAE,OAAO;YAE9B,8CAA8C;YAC9C,IAAI,GAAG,GAAG,MAAM,CAAC;YACjB,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,MAAM,EAAE,wBAAwB,CAAC,CAAC;gBACzD,MAAM,IAAI,GAAG,MAAM,CAAC,YAAY,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;gBAC7C,IAAI,IAAI,EAAE,CAAC;oBACT,GAAG,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAC;gBACjC,CAAC;YACH,CAAC;YAAC,MAAM,CAAC;gBACP,+BAA+B;YACjC,CAAC;YAED,qBAAqB;YACrB,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;gBAC5B,IAAI,CAAC,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,QAAQ,CAAC,EAAE,CAAC;oBACnD,OAAO;gBACT,CAAC;gBACD,GAAG,GAAG,MAAM,CAAC,IAAI,CAAC;YACpB,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO;YACT,CAAC;YAED,OAAO,CAAC,IAAI,CAAC;gBACX,KAAK,EAAE,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC;gBAC1B,GAAG;gBACH,OAAO,EAAE,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC;aAC/B,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,IAAI,OAAO,EAAE,CAAC;YACZ,OAAO,CAAC,OAAO,CAAC,SAAS,OAAO,CAAC,MAAM,UAAU,CAAC,CAAC;QACrD,CAAC;QAED,IAAI,MAAM,EAAE,CAAC;YACX,MAAM,OAAO,GAAG,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,C
AAC,CAAC,CAAC;YACjD,MAAM,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;gBAC1C,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,OAAO,GAAG,IAAI,EAAE,CAAC,GAAG,EAAE,EAAE;oBAC3C,IAAI,GAAG;wBAAE,MAAM,CAAC,GAAG,CAAC,CAAC;;wBAChB,OAAO,EAAE,CAAC;gBACjB,CAAC,CAAC,CAAC;YACL,CAAC,CAAC,CAAC;QACL,CAAC;aAAM,CAAC;YACN,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;gBAC7B,OAAO,CAAC,GAAG,CAAC,KAAK,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;gBACjC,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;gBACxB,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;YAC9B,CAAC;QACH,CAAC;QAED,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,OAAO,EAAE,CAAC;YACZ,OAAO,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;QAChC,CAAC;QAED,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,OAAO,CAAC,KAAK,CAAC,YAAY,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;QAC7C,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,KAAK,CAAC,iCAAiC,CAAC,CAAC;QACnD,CAAC;QAED,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,gBAAgB;AAChB,OAAO;KACJ,OAAO,CAAC,cAAc,CAAC;KACvB,WAAW,CAAC,qBAAqB,CAAC;KAClC,MAAM,CAAC,uBAAuB,EAAE,qCAAqC,EAAE,GAAG,CAAC;KAC3E,MAAM,CAAC,oBAAoB,EAAE,qCAAqC,CAAC;KACnE,MAAM,CAAC,QAAQ,EAAE,sBAAsB,CAAC;KACxC,MAAM,CAAC,cAAc,EAAE,aAAa,CAAC;KACrC,MAAM,CAAC,cAAc,EAAE,sBAAsB,CAAC;KAC9C,MAAM,CAAC,kBAAkB,EAAE,yBAAyB,CAAC;KACrD,MAAM,CAAC,KAAK,EAAE,IAAY,EAAE,OAAO,EAAE,EAAE;IACtC,MAAM,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAC5B,MAAM,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC;IAChC,MAAM,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC;IACpC,MAAM,QAAQ,GAAG,OAAO,CAAC,QAAQ,CAAC;IAElC,MAAM,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,iBAAiB,CAAC,CAAC,KAAK,EAAE,CAAC;IAEjE,IAAI,CAAC;QACH,MAAM,EAAE,YAAY,EAAE,GAAG,MAAM,MAAM,CAAC,IAAI,CAAC,CAAC;QAE5C,sBAAsB;QACtB,IAAI,IAAc,CAAC;QACnB,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,YAAY,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;YAC5C,IAAI,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC;iBACvB,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;iBACxB,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;QACnD,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,KAAK,CAAC,wBAAwB,IAAI,EAAE,CAAC,
CAAC;QAClD,CAAC;QAED,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACtB,MAAM,IAAI,KAAK,CAAC,uBAAuB,CAAC,CAAC;QAC3C,CAAC;QAED,IAAI,OAAO,EAAE,CAAC;YACZ,OAAO,CAAC,IAAI,GAAG,YAAY,IAAI,CAAC,MAAM,uBAAuB,OAAO,CAAC,WAAW,MAAM,CAAC;QACzF,CAAC;QAED,cAAc;QACd,MAAM,OAAO,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE;YACpC,WAAW,EAAE,QAAQ,CAAC,OAAO,CAAC,WAAW,CAAC,IAAI,CAAC;YAC/C,MAAM,EAAE,YAAY;YACpB,QAAQ,EAAE,QAAQ;SACnB,CAAC,CAAC;QAEH,IAAI,OAAO,EAAE,CAAC;YACZ,MAAM,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,SAAS,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC;YAChE,OAAO,CAAC,OAAO,CAAC,cAAc,YAAY,IAAI,IAAI,CAAC,MAAM,aAAa,CAAC,CAAC;QAC1E,CAAC;QAED,iBAAiB;QACjB,IAAI,MAAM,EAAE,CAAC;YACX,MAAM,OAAO,GAAG,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;YACjD,MAAM,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;gBAC1C,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,OAAO,GAAG,IAAI,EAAE,CAAC,GAAG,EAAE,EAAE;oBAC3C,IAAI,GAAG;wBAAE,MAAM,CAAC,GAAG,CAAC,CAAC;;wBAChB,OAAO,EAAE,CAAC;gBACjB,CAAC,CAAC,CAAC;YACL,CAAC,CAAC,CAAC;QACL,CAAC;aAAM,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;YAC1B,MAAM,EAAE,aAAa,EAAE,SAAS,EAAE,GAAG,MAAM,MAAM,CAAC,IAAI,CAAC,CAAC;YACxD,MAAM,EAAE,IAAI,EAAE,GAAG,MAAM,MAAM,CAAC,MAAM,CAAC,CAAC;YAEtC,0BAA0B;YAC1B,SAAS,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;YAE/C,OAAO,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE;gBAC5B,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;gBAChC,MAAM,QAAQ,GAAG,GAAG,CAAC,GAAG,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC,KAAK,CAAC;gBAC9E,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,QAAQ,CAAC,CAAC;gBAEhD,IAAI,SAAS,IAAI,MAAM,EAAE,CAAC;oBACxB,aAAa,CAAC,QAAQ,EAAE,MAAM,CAAC,OAAO,CAAC,CAAC;gBAC1C,CAAC;qBAAM,CAAC;oBACN,aAAa,CAAC,QAAQ,EAAE,UAAU,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;gBACpD,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,IAAI,CAAC,QAAQ,EAAE,CAAC;gBACd,OAAO,CAAC,GAAG,CAAC,uBAAuB,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;YACvD,CAAC;QACH,CAAC;aAAM,CAAC;YACN,0BAA0B;YAC1B,OAAO,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE;gBAC5B,OAAO,CAAC,GAAG,CAAC,SAAS,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC;gBACtC,IAAI,SAAS,IAA
I,MAAM,EAAE,CAAC;oBACxB,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC;gBACpD,CAAC;qBAAM,CAAC;oBACN,OAAO,CAAC,GAAG,CAAC,UAAU,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;gBACxC,CAAC;YACH,CAAC,CAAC,CAAC;QACL,CAAC;QAED,MAAM,OAAO,EAAE,CAAC;QAChB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,OAAO,EAAE,CAAC;YACZ,OAAO,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC;QACrC,CAAC;QAED,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,OAAO,CAAC,KAAK,CAAC,YAAY,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;QAC7C,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,KAAK,CAAC,iCAAiC,CAAC,CAAC;QACnD,CAAC;QAED,MAAM,OAAO,EAAE,CAAC;QAChB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,aAAa,CAAC;KACtB,WAAW,CAAC,qCAAqC,CAAC;KAClD,MAAM,CAAC,sBAAsB,EAAE,0DAA0D,EAAE,QAAQ,EAAE,EAAE,CAAC;KACxG,MAAM,CAAC,sBAAsB,EAAE,6CAA6C,EAAE,QAAQ,EAAE,CAAC,CAAC;KAC1F,MAAM,CAAC,gCAAgC,EAAE,0DAA0D,CAAC;KACpG,MAAM,CAAC,yBAAyB,EAAE,4CAA4C,CAAC;KAC/E,MAAM,CAAC,iBAAiB,EAAE,iDAAiD,CAAC;KAC5E,MAAM,CAAC,mBAAmB,EAAE,mDAAmD,EAAE,QAAQ,EAAE,IAAI,CAAC;KAChG,MAAM,CAAC,cAAc,EAAE,oCAAoC,CAAC;KAC5D,MAAM,CAAC,WAAW,EAAE,gCAAgC,CAAC;KACrD,MAAM,CAAC,cAAc,EAAE,0BAA0B,CAAC;KAClD,MAAM,CAAC,QAAQ,EAAE,gBAAgB,CAAC;KAClC,MAAM,CAAC,KAAK,EAAE,GAAW,EAAE,OAAO,EAAE,EAAE;IACrC,MAAM,EAAE,KAAK,EAAE,GAAG,MAAM,MAAM,CAAC,mBAAmB,CAAC,CAAC;IAEpD,MAAM,OAAO,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC,KAAK,EAAE,CAAC;IAEnE,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;YAC/B,QAAQ,EAAE,OAAO,CAAC,QAAQ;YAC1B,QAAQ,EAAE,OAAO,CAAC,QAAQ;YAC1B,cAAc,EAAE,OAAO,CAAC,cAAc;YACtC,eAAe,EAAE,OAAO,CAAC,OAAO;YAChC,gBAAgB,EAAE,CAAC,OAAO,CAAC,YAAY;YACvC,WAAW,EAAE,OAAO,CAAC,SAAS;YAC9B,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,KAAK;YAC/B,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,KAAK;SAClC,CAAC,CAAC;QAEH,IAAI,OAAO,EAAE,CAAC;YACZ,OAAO,CAAC,OAAO,CAAC,WAAW,OAAO,CAAC,MAAM,QAAQ,CAAC,CAAC;QACrD,CAAC;QAED,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;YACjB,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;QACh
D,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE;gBAC5B,OAAO,CAAC,GAAG,CAAC,KAAK,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC;gBACnC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,OAAO,CAAC,MAAM,KAAK,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;gBAC5D,OAAO,CAAC,GAAG,CAAC,QAAQ,MAAM,CAAC,GAAG,EAAE,CAAC,CAAC;gBAClC,OAAO,CAAC,GAAG,CAAC,UAAU,MAAM,CAAC,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,WAAW,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;gBACzF,OAAO,CAAC,GAAG,CAAC,gBAAgB,MAAM,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC;gBACnD,OAAO,CAAC,GAAG,CAAC,YAAY,MAAM,CAAC,OAAO,IAAI,CAAC,CAAC;gBAE5C,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;oBACjB,OAAO,CAAC,GAAG,CAAC,UAAU,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;gBACxC,CAAC;qBAAM,CAAC;oBACN,OAAO,CAAC,GAAG,CAAC,KAAK,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,MAAM,CAAC,QAAQ,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;gBAChG,CAAC;YACH,CAAC,CAAC,CAAC;QACL,CAAC;QAED,MAAM,OAAO,EAAE,CAAC;QAChB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,OAAO,EAAE,CAAC;YACZ,OAAO,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QAC/B,CAAC;QAED,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,OAAO,CAAC,KAAK,CAAC,YAAY,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;QAC7C,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,KAAK,CAAC,iCAAiC,CAAC,CAAC;QACnD,CAAC;QAED,MAAM,OAAO,EAAE,CAAC;QAChB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,OAAO,CAAC;KAChB,WAAW,CAAC,kBAAkB,CAAC;KAC/B,MAAM,CAAC,mBAAmB,EAAE,aAAa,EAAE,MAAM,CAAC;KAClD,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,MAAM,CAAC,iBAAiB,CAAC,CAAC;IACxD,WAAW,CAAC,EAAE,IAAI,EAAE,QAAQ,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC;AACpD,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,KAAK,CAAC;KACd,WAAW,CAAC,8CAA8C,CAAC;KAC3D,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,MAAM,CAAC,iBAAiB,CAAC,CAAC;AAClC,CAAC,CAAC,CAAC;AAEL,OAAO,CAAC,KAAK,EAAE,CAAC"}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Web crawler functionality
|
|
3
|
+
* Crawls a starting URL and follows links matching specified patterns
|
|
4
|
+
*/
|
|
5
|
+
import type { PeelOptions } from '../types.js';
|
|
6
|
+
/**
 * Options accepted by `crawl`.
 *
 * Every `PeelOptions` field except `format` is forwarded to each page fetch;
 * `format` is omitted because the crawler always requests markdown.
 */
export interface CrawlOptions extends Omit<PeelOptions, 'format'> {
    /**
     * Maximum number of pages to crawl (default: 10, max: 100).
     * Values below 1 are raised to 1 and values above 100 are lowered to 100.
     * Pages that failed to fetch count towards this limit.
     */
    maxPages?: number;
    /**
     * Maximum depth to crawl (default: 2, max: 5). The starting URL is depth 0.
     * Values below 1 are raised to 1, so links found on the starting page are
     * always eligible to be followed.
     */
    maxDepth?: number;
    /**
     * Only crawl URLs from these domains (default: same domain as starting URL).
     * Entries are compared for exact equality with the URL's hostname, so
     * subdomains must be listed explicitly and ports must not be included.
     */
    allowedDomains?: string[];
    /**
     * Exclude URLs matching these patterns (regex strings).
     * Each string is compiled with `new RegExp(pattern)` and tested against the
     * full URL; an invalid pattern makes `crawl` reject.
     */
    excludePatterns?: string[];
    /** Respect robots.txt (default: true) */
    respectRobotsTxt?: boolean;
    /**
     * Rate limit between requests in milliseconds (default: 1000ms = 1 req/sec).
     * Values below 100 are raised to 100. When robots.txt is respected, a
     * `Crawl-delay` directive may take precedence over this value.
     */
    rateLimitMs?: number;
}
|
|
20
|
+
/**
 * One entry per page the crawler attempted to fetch.
 *
 * Failed fetches are reported as entries too: they carry `error` and use
 * empty placeholders for the content fields.
 */
export interface CrawlResult {
    /**
     * URL of the crawled page. On success this is the URL reported by the
     * fetch result; on failure it is the URL that was requested.
     */
    url: string;
    /** Page title (empty string when the page failed to fetch) */
    title: string;
    /** Markdown content (empty string when the page failed to fetch) */
    markdown: string;
    /** All links found on this page (absolute URLs); empty when the page failed to fetch */
    links: string[];
    /** Depth level (0 = starting URL) */
    depth: number;
    /** Parent URL that linked to this page (null for starting URL) */
    parent: string | null;
    /** Time elapsed fetching this page (ms); 0 when the page failed to fetch */
    elapsed: number;
    /** Error message if page failed to fetch; absent on success */
    error?: string;
}
|
|
38
|
+
/**
 * Crawl a website starting from a URL.
 *
 * Pages are visited breadth-first. A page that fails to fetch does not abort
 * the crawl: it is reported as a result whose `error` field is set.
 *
 * @param startUrl - Starting URL to crawl from (must be an absolute URL)
 * @param options - Crawl options
 * @returns Array of crawl results, including entries for failed pages
 * @throws Rejects with a `TypeError` when `startUrl` cannot be parsed as a URL
 * @throws Rejects with a `SyntaxError` when an `excludePatterns` entry is not a valid regular expression
 *
 * @example
 * ```typescript
 * import { crawl } from 'webpeel';
 *
 * const results = await crawl('https://example.com', {
 *   maxPages: 20,
 *   maxDepth: 2,
 * });
 *
 * console.log(`Crawled ${results.length} pages`);
 * ```
 */
export declare function crawl(startUrl: string, options?: CrawlOptions): Promise<CrawlResult[]>;
|
|
58
|
+
//# sourceMappingURL=crawler.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"crawler.d.ts","sourceRoot":"","sources":["../../src/core/crawler.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAG/C,MAAM,WAAW,YAAa,SAAQ,IAAI,CAAC,WAAW,EAAE,QAAQ,CAAC;IAC/D,+DAA+D;IAC/D,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,kDAAkD;IAClD,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,gFAAgF;IAChF,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;IAC1B,2DAA2D;IAC3D,eAAe,CAAC,EAAE,MAAM,EAAE,CAAC;IAC3B,yCAAyC;IACzC,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,gFAAgF;IAChF,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,WAAW,WAAW;IAC1B,8BAA8B;IAC9B,GAAG,EAAE,MAAM,CAAC;IACZ,iBAAiB;IACjB,KAAK,EAAE,MAAM,CAAC;IACd,uBAAuB;IACvB,QAAQ,EAAE,MAAM,CAAC;IACjB,mDAAmD;IACnD,KAAK,EAAE,MAAM,EAAE,CAAC;IAChB,qCAAqC;IACrC,KAAK,EAAE,MAAM,CAAC;IACd,kEAAkE;IAClE,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;IACtB,2CAA2C;IAC3C,OAAO,EAAE,MAAM,CAAC;IAChB,4CAA4C;IAC5C,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAsFD;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAsB,KAAK,CACzB,QAAQ,EAAE,MAAM,EAChB,OAAO,GAAE,YAAiB,GACzB,OAAO,CAAC,WAAW,EAAE,CAAC,CAwIxB"}
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Web crawler functionality
|
|
3
|
+
* Crawls a starting URL and follows links matching specified patterns
|
|
4
|
+
*/
|
|
5
|
+
import { peel } from '../index.js';
|
|
6
|
+
import { fetch as undiciFetch } from 'undici';
|
|
7
|
+
/**
 * Parse the text of a robots.txt file and collect the rules that apply to
 * the wildcard user agent (`User-agent: *`).
 *
 * - Everything after `#` on a line is a comment and is ignored.
 * - Consecutive `User-agent` lines form one group; the group applies when
 *   any of its agents is `*`.
 * - `Crawl-delay` is given in seconds and may be fractional; negative or
 *   non-numeric values are ignored.
 *
 * @param {string} text - Raw robots.txt content
 * @returns {{ disallowedPaths: string[], crawlDelay: (number|undefined) }}
 *   Disallow patterns in file order and the crawl delay in milliseconds
 *   (undefined when no valid directive was found)
 */
function parseRobotsTxt(text) {
    const disallowedPaths = [];
    let crawlDelay;
    let inWildcardGroup = false;
    let previousWasUserAgent = false;

    for (const rawLine of text.split(/\r?\n/)) {
        const hashIndex = rawLine.indexOf('#');
        const line = (hashIndex === -1 ? rawLine : rawLine.slice(0, hashIndex)).trim();
        const colonIndex = line.indexOf(':');
        if (colonIndex === -1) {
            // Blank line, comment-only line, or something that is not a directive.
            continue;
        }

        const field = line.slice(0, colonIndex).trim().toLowerCase();
        const value = line.slice(colonIndex + 1).trim();

        if (field === 'user-agent') {
            // A user-agent line directly after another one extends the same
            // group instead of starting a new one.
            const isWildcard = value === '*';
            inWildcardGroup = previousWasUserAgent ? inWildcardGroup || isWildcard : isWildcard;
            previousWasUserAgent = true;
            continue;
        }
        previousWasUserAgent = false;

        if (!inWildcardGroup) {
            continue;
        }

        if (field === 'disallow') {
            // An empty Disallow means "nothing is disallowed".
            if (value) {
                disallowedPaths.push(value);
            }
        } else if (field === 'crawl-delay') {
            const seconds = Number.parseFloat(value);
            if (Number.isFinite(seconds) && seconds >= 0) {
                crawlDelay = Math.round(seconds * 1000); // Convert to milliseconds
            }
        }
    }

    return { disallowedPaths, crawlDelay };
}

/**
 * Fetch robots.txt for a host and return the rules for `User-agent: *`.
 *
 * This is best-effort: if the file is missing, the request fails or times
 * out (5 seconds), an empty rule set is returned so that everything is allowed.
 *
 * @param {string} domain - Host name (optionally with `:port`) to fetch robots.txt from
 * @param {string} [protocol='https:'] - URL scheme including the trailing colon,
 *   in the same form as `URL.protocol`
 * @returns {Promise<{ disallowedPaths: string[], crawlDelay?: number }>}
 *   Disallow patterns and, when present, the crawl delay in milliseconds
 */
async function fetchRobotsTxt(domain, protocol = 'https:') {
    const robotsUrl = `${protocol}//${domain}/robots.txt`;
    try {
        const response = await undiciFetch(robotsUrl, {
            headers: {
                'User-Agent': 'WebPeel/0.3.0 (+https://webpeel.dev)',
            },
            signal: AbortSignal.timeout(5000), // 5 second timeout
        });
        if (!response.ok) {
            // If robots.txt doesn't exist, allow everything
            return { disallowedPaths: [] };
        }
        return parseRobotsTxt(await response.text());
    }
    catch {
        // Deliberate best-effort: if we can't fetch robots.txt, allow everything
        return { disallowedPaths: [] };
    }
}
|
|
60
|
+
/**
 * Test a URL path against a single robots.txt path pattern.
 *
 * The pattern is matched from the start of the target. `*` matches any
 * sequence of characters and a trailing `$` anchors the match to the end of
 * the target. Matching is done with plain string searches rather than a
 * generated regular expression, so patterns from untrusted robots.txt files
 * cannot cause catastrophic backtracking.
 *
 * @param {string} target - Path plus query string of the URL being checked
 * @param {string} pattern - Non-empty robots.txt path pattern
 * @returns {boolean} true when the pattern matches the target
 */
function matchesRobotsPattern(target, pattern) {
    const anchoredEnd = pattern.endsWith('$');
    const body = anchoredEnd ? pattern.slice(0, -1) : pattern;
    const segments = body.split('*');

    const first = segments[0];
    if (!target.startsWith(first)) {
        return false;
    }
    let cursor = first.length;

    if (segments.length === 1) {
        // No wildcard: plain prefix match, or exact match when anchored.
        return !anchoredEnd || target.length === cursor;
    }

    // Middle segments must appear in order after the previous match.
    for (const segment of segments.slice(1, -1)) {
        const found = target.indexOf(segment, cursor);
        if (found === -1) {
            return false;
        }
        cursor = found + segment.length;
    }

    const last = segments[segments.length - 1];
    if (anchoredEnd) {
        return target.length - last.length >= cursor && target.endsWith(last);
    }
    return target.indexOf(last, cursor) !== -1;
}

/**
 * Check if a URL is allowed by robots.txt rules.
 *
 * Each disallow pattern is compared with the URL's path and query string.
 * Literal patterns behave as prefixes; `*` wildcards and a trailing `$`
 * end anchor are supported. Empty patterns are ignored because an empty
 * `Disallow` means that nothing is disallowed.
 *
 * @param {string} url - Absolute URL to check
 * @param {{ disallowedPaths: string[] }} rules - Parsed robots.txt rules
 * @returns {boolean} false when any disallow pattern matches, true otherwise
 * @throws {TypeError} When `url` is not a valid absolute URL
 */
function isAllowedByRobots(url, rules) {
    const urlObj = new URL(url);
    const target = `${urlObj.pathname}${urlObj.search}`;
    return !rules.disallowedPaths.some((pattern) => pattern !== '' && matchesRobotsPattern(target, pattern));
}
|
|
74
|
+
/**
 * Clamp a numeric option into `[min, max]`, falling back to a default when
 * the value is not a finite number (for example NaN from a failed parse).
 *
 * @param {*} value - Caller-supplied option value
 * @param {number} min - Smallest accepted value
 * @param {number} max - Largest accepted value
 * @param {number} fallback - Value used when `value` is not finite
 * @returns {number} The clamped value or the fallback
 */
function clampLimit(value, min, max, fallback) {
    const numeric = Number(value);
    if (!Number.isFinite(numeric)) {
        return fallback;
    }
    return Math.min(Math.max(numeric, min), max);
}

/**
 * Pick the delay to wait between requests: the robots.txt Crawl-delay is
 * used only when it is larger than the requested rate limit.
 *
 * @param {number} requestedMs - Validated rate limit requested by the caller
 * @param {number} [crawlDelayMs] - Crawl-delay from robots.txt in milliseconds, if any
 * @returns {number} Delay in milliseconds
 */
function resolveRateLimit(requestedMs, crawlDelayMs) {
    return Number.isFinite(crawlDelayMs) && crawlDelayMs > requestedMs ? crawlDelayMs : requestedMs;
}

/**
 * Crawl a website starting from a URL
 *
 * Pages are visited breadth-first. A page that fails to fetch is reported as
 * a result with `error` set and does not abort the crawl.
 *
 * @param startUrl - Starting URL to crawl from
 * @param options - Crawl options
 * @returns Array of crawl results
 * @throws {TypeError} (as a rejection) when `startUrl` is not a valid absolute URL
 * @throws {SyntaxError} (as a rejection) when an exclude pattern is not a valid regular expression
 *
 * @example
 * ```typescript
 * import { crawl } from 'webpeel';
 *
 * const results = await crawl('https://example.com', {
 *   maxPages: 20,
 *   maxDepth: 2,
 * });
 *
 * console.log(`Crawled ${results.length} pages`);
 * ```
 */
export async function crawl(startUrl, options = {}) {
    const { maxPages = 10, maxDepth = 2, allowedDomains, excludePatterns = [], respectRobotsTxt = true, rateLimitMs = 1000, ...peelOptions } = options;

    // Validate limits; non-finite values fall back to the documented defaults
    // instead of silently disabling the crawl or the rate limiting.
    const validatedMaxPages = clampLimit(maxPages, 1, 100, 10);
    const validatedMaxDepth = clampLimit(maxDepth, 1, 5, 2);
    const validatedRateLimit = clampLimit(rateLimitMs, 100, Infinity, 1000); // Min 100ms between requests

    // Parse starting URL (throws on an invalid URL)
    const startUrlObj = new URL(startUrl);

    // Default: only crawl same domain as starting URL
    const validatedAllowedDomains = allowedDomains && allowedDomains.length > 0
        ? allowedDomains
        : [startUrlObj.hostname];

    // Compile exclude patterns (throws on an invalid pattern)
    const excludeRegexes = excludePatterns.map((pattern) => new RegExp(pattern));

    // robots.txt rules are looked up per origin, so that the rules of one
    // host are never applied to another, and cached for the whole crawl.
    const robotsByOrigin = new Map();
    const robotsRulesFor = async (urlObj) => {
        if (!respectRobotsTxt) {
            return { disallowedPaths: [] };
        }
        let rules = robotsByOrigin.get(urlObj.origin);
        if (rules === undefined) {
            rules = await fetchRobotsTxt(urlObj.host, urlObj.protocol);
            robotsByOrigin.set(urlObj.origin, rules);
            // Use crawl-delay from robots.txt if it's larger than our rate limit
            if (rules.crawlDelay && rules.crawlDelay > validatedRateLimit) {
                console.error(`[Crawler] Using Crawl-delay from robots.txt: ${rules.crawlDelay}ms`);
            }
        }
        return rules;
    };

    // State tracking. `seen` holds every URL that was ever queued, so each
    // URL enters the queue at most once and keeps its first-discovered parent.
    const results = [];
    const seen = new Set([startUrl]);
    const queue = [
        { url: startUrl, depth: 0, parent: null },
    ];
    let hasFetched = false;

    for (let head = 0; head < queue.length && results.length < validatedMaxPages; head += 1) {
        const { url, depth, parent } = queue[head];

        // Skip if depth exceeded
        if (depth > validatedMaxDepth) {
            continue;
        }

        // Validate URL
        let urlObj;
        try {
            urlObj = new URL(url);
        }
        catch {
            continue; // Skip invalid URLs
        }

        // Check if domain is allowed
        if (!validatedAllowedDomains.includes(urlObj.hostname)) {
            continue;
        }

        // Check exclude patterns
        if (excludeRegexes.some((regex) => regex.test(url))) {
            continue;
        }

        // Check robots.txt
        const robotsRules = await robotsRulesFor(urlObj);
        if (respectRobotsTxt && !isAllowedByRobots(url, robotsRules)) {
            console.error(`[Crawler] Skipping ${url} (disallowed by robots.txt)`);
            continue;
        }

        // Rate limiting: wait before every page request except the first, so
        // failed requests are throttled too and no delay is spent after the
        // last page.
        if (hasFetched) {
            const delayMs = resolveRateLimit(validatedRateLimit, robotsRules.crawlDelay);
            await new Promise((resolve) => setTimeout(resolve, delayMs));
        }
        hasFetched = true;

        // Fetch the page. Only the fetch itself is guarded, so a page is
        // recorded either as a success or as a failure, never both.
        let page;
        try {
            page = await peel(url, {
                ...peelOptions,
                format: 'markdown',
            });
        }
        catch (error) {
            // Log error and continue
            const errorMessage = error instanceof Error ? error.message : 'Unknown error';
            console.error(`[Crawler] Failed to fetch ${url}: ${errorMessage}`);
            results.push({
                url,
                title: '',
                markdown: '',
                links: [],
                depth,
                parent,
                elapsed: 0,
                error: errorMessage,
            });
            continue;
        }

        const links = Array.isArray(page.links) ? page.links : [];
        results.push({
            url: page.url,
            title: page.title,
            markdown: page.content,
            links,
            depth,
            parent,
            elapsed: page.elapsed,
        });

        // Add discovered links to queue
        if (depth < validatedMaxDepth) {
            for (const link of links) {
                if (!seen.has(link)) {
                    seen.add(link);
                    queue.push({
                        url: link,
                        depth: depth + 1,
                        parent: url,
                    });
                }
            }
        }
    }

    return results;
}
|
|
205
|
+
//# sourceMappingURL=crawler.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"crawler.js","sourceRoot":"","sources":["../../src/core/crawler.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,IAAI,EAAE,MAAM,aAAa,CAAC;AAEnC,OAAO,EAAE,KAAK,IAAI,WAAW,EAAE,MAAM,QAAQ,CAAC;AAyC9C;;GAEG;AACH,KAAK,UAAU,cAAc,CAAC,MAAc;IAC1C,MAAM,SAAS,GAAG,WAAW,MAAM,aAAa,CAAC;IAEjD,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,MAAM,WAAW,CAAC,SAAS,EAAE;YAC5C,OAAO,EAAE;gBACP,YAAY,EAAE,sCAAsC;aACrD;YACD,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,mBAAmB;SACvD,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,gDAAgD;YAChD,OAAO,EAAE,eAAe,EAAE,EAAE,EAAE,CAAC;QACjC,CAAC;QAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;QACnC,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAE/B,MAAM,eAAe,GAAa,EAAE,CAAC;QACrC,IAAI,UAA8B,CAAC;QACnC,IAAI,eAAe,GAAG,KAAK,CAAC;QAE5B,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;YAE5B,0BAA0B;YAC1B,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC,UAAU,CAAC,aAAa,CAAC,EAAE,CAAC;gBACpD,MAAM,KAAK,GAAG,OAAO,CAAC,SAAS,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;gBAC7D,eAAe,GAAG,KAAK,KAAK,GAAG,CAAC;gBAChC,SAAS;YACX,CAAC;YAED,IAAI,CAAC,eAAe;gBAAE,SAAS;YAE/B,4BAA4B;YAC5B,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;gBAClD,MAAM,IAAI,GAAG,OAAO,CAAC,SAAS,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;gBAC1D,IAAI,IAAI,EAAE,CAAC;oBACT,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBAC7B,CAAC;YACH,CAAC;YAED,8BAA8B;YAC9B,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC,UAAU,CAAC,cAAc,CAAC,EAAE,CAAC;gBACrD,MAAM,KAAK,GAAG,QAAQ,CAAC,OAAO,CAAC,SAAS,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;gBACxE,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;oBAClB,UAAU,GAAG,KAAK,GAAG,IAAI,CAAC,CAAC,0BAA0B;gBACvD,CAAC;YACH,CAAC;QACH,CAAC;QAED,OAAO,EAAE,eAAe,EAAE,UAAU,EAAE,CAAC;IACzC,CAAC;IAAC,MAAM,CAAC;QACP,iDAAiD;QACjD,OAAO,EAAE,eAAe,EAAE,EAAE,EAAE,CAAC;IACjC,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,iBAAiB,CAAC,GAAW,EAAE,KAAkB;IACxD,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;IAC5B,MAAM,IAAI,GAAG,MAAM,CAAC,QAAQ,CAAC;IAE7B,KAAK,MAAM,UAAU,IAAI,KAAK,CAAC,eAAe,EAAE,CAAC;QAC
/C,4EAA4E;QAC5E,IAAI,IAAI,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;YAChC,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;;;;;;;;;;;;;;;;GAkBG;AACH,MAAM,CAAC,KAAK,UAAU,KAAK,CACzB,QAAgB,EAChB,UAAwB,EAAE;IAE1B,MAAM,EACJ,QAAQ,GAAG,EAAE,EACb,QAAQ,GAAG,CAAC,EACZ,cAAc,EACd,eAAe,GAAG,EAAE,EACpB,gBAAgB,GAAG,IAAI,EACvB,WAAW,GAAG,IAAI,EAClB,GAAG,WAAW,EACf,GAAG,OAAO,CAAC;IAEZ,kBAAkB;IAClB,MAAM,iBAAiB,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;IAC/D,MAAM,iBAAiB,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IAC7D,MAAM,kBAAkB,GAAG,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,GAAG,CAAC,CAAC,CAAC,6BAA6B;IAEpF,qBAAqB;IACrB,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC,QAAQ,CAAC,CAAC;IACtC,MAAM,WAAW,GAAG,WAAW,CAAC,QAAQ,CAAC;IAEzC,kDAAkD;IAClD,MAAM,uBAAuB,GAAG,cAAc,IAAI,cAAc,CAAC,MAAM,GAAG,CAAC;QACzE,CAAC,CAAC,cAAc;QAChB,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC;IAElB,2BAA2B;IAC3B,MAAM,cAAc,GAAG,eAAe,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC;IAE3E,6BAA6B;IAC7B,IAAI,WAAW,GAAgB,EAAE,eAAe,EAAE,EAAE,EAAE,CAAC;IACvD,IAAI,gBAAgB,EAAE,CAAC;QACrB,WAAW,GAAG,MAAM,cAAc,CAAC,WAAW,CAAC,CAAC;QAEhD,qEAAqE;QACrE,IAAI,WAAW,CAAC,UAAU,IAAI,WAAW,CAAC,UAAU,GAAG,kBAAkB,EAAE,CAAC;YAC1E,OAAO,CAAC,KAAK,CAAC,gDAAgD,WAAW,CAAC,UAAU,IAAI,CAAC,CAAC;QAC5F,CAAC;IACH,CAAC;IAED,MAAM,kBAAkB,GAAG,WAAW,CAAC,UAAU,IAAI,kBAAkB,CAAC;IAExE,iBAAiB;IACjB,MAAM,OAAO,GAAkB,EAAE,CAAC;IAClC,MAAM,OAAO,GAAG,IAAI,GAAG,EAAU,CAAC;IAClC,MAAM,KAAK,GAAiE;QAC1E,EAAE,GAAG,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC,EAAE,MAAM,EAAE,IAAI,EAAE;KAC1C,CAAC;IAEF,OAAO,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,OAAO,CAAC,MAAM,GAAG,iBAAiB,EAAE,CAAC;QAC9D,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,EAAG,CAAC;QAC5B,MAAM,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,EAAE,GAAG,IAAI,CAAC;QAEpC,0BAA0B;QAC1B,IAAI,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC;YAAE,SAAS;QAC/B,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAEjB,yBAAyB;QACzB,IAAI,KAAK,GAAG,iBAAiB;YAAE,SAAS;QAExC,eAAe;QACf,IAAI,MAAW,CAAC;QAChB,IAAI,CAAC;YACH,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;
QACxB,CAAC;QAAC,MAAM,CAAC;YACP,SAAS,CAAC,oBAAoB;QAChC,CAAC;QAED,6BAA6B;QAC7B,IAAI,CAAC,uBAAuB,CAAC,QAAQ,CAAC,MAAM,CAAC,QAAQ,CAAC,EAAE,CAAC;YACvD,SAAS;QACX,CAAC;QAED,yBAAyB;QACzB,IAAI,cAAc,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;YAClD,SAAS;QACX,CAAC;QAED,mBAAmB;QACnB,IAAI,gBAAgB,IAAI,CAAC,iBAAiB,CAAC,GAAG,EAAE,WAAW,CAAC,EAAE,CAAC;YAC7D,OAAO,CAAC,KAAK,CAAC,sBAAsB,GAAG,6BAA6B,CAAC,CAAC;YACtE,SAAS;QACX,CAAC;QAED,iBAAiB;QACjB,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,GAAG,EAAE;gBAC7B,GAAG,WAAW;gBACd,MAAM,EAAE,UAAU;aACnB,CAAC,CAAC;YAEH,OAAO,CAAC,IAAI,CAAC;gBACX,GAAG,EAAE,MAAM,CAAC,GAAG;gBACf,KAAK,EAAE,MAAM,CAAC,KAAK;gBACnB,QAAQ,EAAE,MAAM,CAAC,OAAO;gBACxB,KAAK,EAAE,MAAM,CAAC,KAAK;gBACnB,KAAK;gBACL,MAAM;gBACN,OAAO,EAAE,MAAM,CAAC,OAAO;aACxB,CAAC,CAAC;YAEH,gCAAgC;YAChC,IAAI,KAAK,GAAG,iBAAiB,EAAE,CAAC;gBAC9B,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;oBAChC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;wBACvB,KAAK,CAAC,IAAI,CAAC;4BACT,GAAG,EAAE,IAAI;4BACT,KAAK,EAAE,KAAK,GAAG,CAAC;4BAChB,MAAM,EAAE,GAAG;yBACZ,CAAC,CAAC;oBACL,CAAC;gBACH,CAAC;YACH,CAAC;YAED,gBAAgB;YAChB,IAAI,OAAO,CAAC,MAAM,GAAG,iBAAiB,EAAE,CAAC;gBACvC,MAAM,IAAI,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,kBAAkB,CAAC,CAAC,CAAC;YACxE,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,yBAAyB;YACzB,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC;YAC9E,OAAO,CAAC,KAAK,CAAC,6BAA6B,GAAG,KAAK,YAAY,EAAE,CAAC,CAAC;YAEnE,OAAO,CAAC,IAAI,CAAC;gBACX,GAAG;gBACH,KAAK,EAAE,EAAE;gBACT,QAAQ,EAAE,EAAE;gBACZ,KAAK,EAAE,EAAE;gBACT,KAAK;gBACL,MAAM;gBACN,OAAO,EAAE,CAAC;gBACV,KAAK,EAAE,YAAY;aACpB,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC"}
|
package/dist/core/fetcher.d.ts
CHANGED
|
@@ -5,13 +5,15 @@ export interface FetchResult {
|
|
|
5
5
|
html: string;
|
|
6
6
|
url: string;
|
|
7
7
|
statusCode?: number;
|
|
8
|
+
screenshot?: Buffer;
|
|
9
|
+
contentType?: string;
|
|
8
10
|
}
|
|
9
11
|
/**
|
|
10
12
|
* Simple HTTP fetch using native fetch + Cheerio
|
|
11
13
|
* Fast and lightweight, but can be blocked by Cloudflare/bot detection
|
|
12
14
|
* SECURITY: Manual redirect handling with SSRF re-validation
|
|
13
15
|
*/
|
|
14
|
-
export declare function simpleFetch(url: string, userAgent?: string, timeoutMs?: number): Promise<FetchResult>;
|
|
16
|
+
export declare function simpleFetch(url: string, userAgent?: string, timeoutMs?: number, customHeaders?: Record<string, string>): Promise<FetchResult>;
|
|
15
17
|
/**
|
|
16
18
|
* Fetch using headless Chromium via Playwright
|
|
17
19
|
* Slower but can handle JavaScript-heavy sites and bypass some bot detection
|
|
@@ -20,6 +22,11 @@ export declare function browserFetch(url: string, options?: {
|
|
|
20
22
|
userAgent?: string;
|
|
21
23
|
waitMs?: number;
|
|
22
24
|
timeoutMs?: number;
|
|
25
|
+
screenshot?: boolean;
|
|
26
|
+
screenshotFullPage?: boolean;
|
|
27
|
+
headers?: Record<string, string>;
|
|
28
|
+
cookies?: string[];
|
|
29
|
+
stealth?: boolean;
|
|
23
30
|
}): Promise<FetchResult>;
|
|
24
31
|
/**
|
|
25
32
|
* Retry a fetch operation with exponential backoff
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"fetcher.d.ts","sourceRoot":"","sources":["../../src/core/fetcher.ts"],"names":[],"mappings":"AAAA;;GAEG;
|
|
1
|
+
{"version":3,"file":"fetcher.d.ts","sourceRoot":"","sources":["../../src/core/fetcher.ts"],"names":[],"mappings":"AAAA;;GAEG;AAgQH,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,MAAM,CAAC;IACb,GAAG,EAAE,MAAM,CAAC;IACZ,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;;;GAIG;AACH,wBAAsB,WAAW,CAC/B,GAAG,EAAE,MAAM,EACX,SAAS,CAAC,EAAE,MAAM,EAClB,SAAS,GAAE,MAAc,EACzB,aAAa,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GACrC,OAAO,CAAC,WAAW,CAAC,CA2JtB;AAyCD;;;GAGG;AACH,wBAAsB,YAAY,CAChC,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IACP,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,kBAAkB,CAAC,EAAE,OAAO,CAAC;IAC7B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IACnB,OAAO,CAAC,EAAE,OAAO,CAAC;CACd,GACL,OAAO,CAAC,WAAW,CAAC,CAkKtB;AAED;;GAEG;AACH,wBAAsB,UAAU,CAAC,CAAC,EAChC,EAAE,EAAE,MAAM,OAAO,CAAC,CAAC,CAAC,EACpB,WAAW,GAAE,MAAU,EACvB,WAAW,GAAE,MAAa,GACzB,OAAO,CAAC,CAAC,CAAC,CAsBZ;AAED;;GAEG;AACH,wBAAsB,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC,CAS7C"}
|
package/dist/core/fetcher.js
CHANGED
|
@@ -2,7 +2,11 @@
|
|
|
2
2
|
* Core fetching logic: simple HTTP and browser-based fetching
|
|
3
3
|
*/
|
|
4
4
|
import { chromium } from 'playwright';
|
|
5
|
+
import { chromium as stealthChromium } from 'playwright-extra';
|
|
6
|
+
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
|
5
7
|
import { TimeoutError, BlockedError, NetworkError, WebPeelError } from '../types.js';
|
|
8
|
+
// Add stealth plugin to playwright-extra
|
|
9
|
+
stealthChromium.use(StealthPlugin());
|
|
6
10
|
const USER_AGENTS = [
|
|
7
11
|
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
|
8
12
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
|
@@ -224,11 +228,31 @@ function validateUserAgent(userAgent) {
|
|
|
224
228
|
* Fast and lightweight, but can be blocked by Cloudflare/bot detection
|
|
225
229
|
* SECURITY: Manual redirect handling with SSRF re-validation
|
|
226
230
|
*/
|
|
227
|
-
export async function simpleFetch(url, userAgent, timeoutMs = 30000) {
|
|
231
|
+
export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeaders) {
|
|
228
232
|
// SECURITY: Validate URL to prevent SSRF
|
|
229
233
|
validateUrl(url);
|
|
230
234
|
// Validate user agent if provided
|
|
231
235
|
const validatedUserAgent = userAgent ? validateUserAgent(userAgent) : getRandomUserAgent();
|
|
236
|
+
// SECURITY: Merge custom headers with defaults, block Host header override
|
|
237
|
+
const defaultHeaders = {
|
|
238
|
+
'User-Agent': validatedUserAgent,
|
|
239
|
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
|
240
|
+
'Accept-Language': 'en-US,en;q=0.9',
|
|
241
|
+
'Accept-Encoding': 'gzip, deflate, br',
|
|
242
|
+
'DNT': '1',
|
|
243
|
+
'Connection': 'keep-alive',
|
|
244
|
+
'Upgrade-Insecure-Requests': '1',
|
|
245
|
+
};
|
|
246
|
+
const mergedHeaders = { ...defaultHeaders };
|
|
247
|
+
if (customHeaders) {
|
|
248
|
+
for (const [key, value] of Object.entries(customHeaders)) {
|
|
249
|
+
// SECURITY: Block Host header override
|
|
250
|
+
if (key.toLowerCase() === 'host') {
|
|
251
|
+
throw new WebPeelError('Custom Host header is not allowed');
|
|
252
|
+
}
|
|
253
|
+
mergedHeaders[key] = value;
|
|
254
|
+
}
|
|
255
|
+
}
|
|
232
256
|
const MAX_REDIRECTS = 10;
|
|
233
257
|
let redirectCount = 0;
|
|
234
258
|
let currentUrl = url;
|
|
@@ -245,15 +269,7 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000) {
|
|
|
245
269
|
const timer = setTimeout(() => controller.abort(), timeoutMs);
|
|
246
270
|
try {
|
|
247
271
|
const response = await fetch(currentUrl, {
|
|
248
|
-
headers:
|
|
249
|
-
'User-Agent': validatedUserAgent,
|
|
250
|
-
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
|
251
|
-
'Accept-Language': 'en-US,en;q=0.9',
|
|
252
|
-
'Accept-Encoding': 'gzip, deflate, br',
|
|
253
|
-
'DNT': '1',
|
|
254
|
-
'Connection': 'keep-alive',
|
|
255
|
-
'Upgrade-Insecure-Requests': '1',
|
|
256
|
-
},
|
|
272
|
+
headers: mergedHeaders,
|
|
257
273
|
signal: controller.signal,
|
|
258
274
|
redirect: 'manual', // SECURITY: Manual redirect handling
|
|
259
275
|
});
|
|
@@ -277,8 +293,10 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000) {
|
|
|
277
293
|
}
|
|
278
294
|
// SECURITY: Validate Content-Type
|
|
279
295
|
const contentType = response.headers.get('content-type') || '';
|
|
280
|
-
if (!contentType.includes('text/html') &&
|
|
281
|
-
|
|
296
|
+
if (!contentType.includes('text/html') &&
|
|
297
|
+
!contentType.includes('application/xhtml+xml') &&
|
|
298
|
+
!contentType.includes('application/pdf')) {
|
|
299
|
+
throw new WebPeelError(`Unsupported content type: ${contentType}. Supported: HTML, PDF`);
|
|
282
300
|
}
|
|
283
301
|
// SECURITY: Stream response with size limit (prevent memory exhaustion)
|
|
284
302
|
const chunks = [];
|
|
@@ -323,6 +341,7 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000) {
|
|
|
323
341
|
html,
|
|
324
342
|
url: currentUrl,
|
|
325
343
|
statusCode: response.status,
|
|
344
|
+
contentType,
|
|
326
345
|
};
|
|
327
346
|
}
|
|
328
347
|
catch (error) {
|
|
@@ -339,6 +358,7 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000) {
|
|
|
339
358
|
throw new WebPeelError(`Too many redirects (max ${MAX_REDIRECTS})`);
|
|
340
359
|
}
|
|
341
360
|
let sharedBrowser = null;
|
|
361
|
+
let sharedStealthBrowser = null;
|
|
342
362
|
let activePagesCount = 0;
|
|
343
363
|
const MAX_CONCURRENT_PAGES = 5;
|
|
344
364
|
async function getBrowser() {
|
|
@@ -357,6 +377,22 @@ async function getBrowser() {
|
|
|
357
377
|
sharedBrowser = await chromium.launch({ headless: true });
|
|
358
378
|
return sharedBrowser;
|
|
359
379
|
}
|
|
380
|
+
/**
 * Return the shared stealth browser, launching a new one when none is usable.
 *
 * The cached instance in `sharedStealthBrowser` is reused only while it still
 * reports itself as connected. Otherwise a fresh headless browser is launched
 * through `stealthChromium`, cached in `sharedStealthBrowser`, and returned.
 *
 * @returns {Promise<object>} the browser instance held in `sharedStealthBrowser`
 */
async function getStealthBrowser() {
    // SECURITY: only hand back a cached browser that is still connected
    let reusable = false;
    if (sharedStealthBrowser) {
        try {
            reusable = sharedStealthBrowser.isConnected();
        }
        catch {
            // The health probe itself failed: drop the cached handle and relaunch
            sharedStealthBrowser = null;
        }
    }
    if (reusable) {
        return sharedStealthBrowser;
    }
    // No cached browser, or it is no longer connected: start a new one
    const launched = await stealthChromium.launch({ headless: true });
    sharedStealthBrowser = launched;
    return launched;
}
|
|
360
396
|
/**
|
|
361
397
|
* Fetch using headless Chromium via Playwright
|
|
362
398
|
* Slower but can handle JavaScript-heavy sites and bypass some bot detection
|
|
@@ -364,13 +400,25 @@ async function getBrowser() {
|
|
|
364
400
|
export async function browserFetch(url, options = {}) {
|
|
365
401
|
// SECURITY: Validate URL to prevent SSRF
|
|
366
402
|
validateUrl(url);
|
|
367
|
-
const { userAgent, waitMs = 0, timeoutMs = 30000 } = options;
|
|
403
|
+
const { userAgent, waitMs = 0, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, stealth = false } = options;
|
|
368
404
|
// Validate user agent if provided
|
|
369
405
|
const validatedUserAgent = userAgent ? validateUserAgent(userAgent) : getRandomUserAgent();
|
|
370
406
|
// Validate wait time
|
|
371
407
|
if (waitMs < 0 || waitMs > 60000) {
|
|
372
408
|
throw new WebPeelError('Wait time must be between 0 and 60000ms');
|
|
373
409
|
}
|
|
410
|
+
// SECURITY: Validate custom headers if provided
|
|
411
|
+
if (headers) {
|
|
412
|
+
for (const [key, value] of Object.entries(headers)) {
|
|
413
|
+
// Block Host header override
|
|
414
|
+
if (key.toLowerCase() === 'host') {
|
|
415
|
+
throw new WebPeelError('Custom Host header is not allowed');
|
|
416
|
+
}
|
|
417
|
+
if (typeof value !== 'string' || value.length > 500) {
|
|
418
|
+
throw new WebPeelError('Invalid header value');
|
|
419
|
+
}
|
|
420
|
+
}
|
|
421
|
+
}
|
|
374
422
|
// SECURITY: Limit concurrent browser pages with timeout
|
|
375
423
|
const queueStartTime = Date.now();
|
|
376
424
|
const QUEUE_TIMEOUT_MS = 30000; // 30 second max wait
|
|
@@ -383,20 +431,46 @@ export async function browserFetch(url, options = {}) {
|
|
|
383
431
|
activePagesCount++;
|
|
384
432
|
let page = null;
|
|
385
433
|
try {
|
|
386
|
-
const browser = await getBrowser();
|
|
434
|
+
const browser = stealth ? await getStealthBrowser() : await getBrowser();
|
|
387
435
|
page = await browser.newPage({
|
|
388
436
|
userAgent: validatedUserAgent,
|
|
389
437
|
});
|
|
390
|
-
//
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
438
|
+
// Set custom headers if provided
|
|
439
|
+
if (headers && Object.keys(headers).length > 0) {
|
|
440
|
+
await page.setExtraHTTPHeaders(headers);
|
|
441
|
+
}
|
|
442
|
+
// Set cookies if provided
|
|
443
|
+
if (cookies && cookies.length > 0) {
|
|
444
|
+
const parsedCookies = cookies.map(cookie => {
|
|
445
|
+
const [nameValue] = cookie.split(';').map(s => s.trim());
|
|
446
|
+
const [name, value] = nameValue.split('=');
|
|
447
|
+
if (!name || value === undefined) {
|
|
448
|
+
throw new WebPeelError(`Invalid cookie format: ${cookie}`);
|
|
449
|
+
}
|
|
450
|
+
return {
|
|
451
|
+
name: name.trim(),
|
|
452
|
+
value: value.trim(),
|
|
453
|
+
url,
|
|
454
|
+
};
|
|
455
|
+
});
|
|
456
|
+
await page.context().addCookies(parsedCookies);
|
|
457
|
+
}
|
|
458
|
+
// Block images, fonts, and other heavy resources for speed (unless screenshot is requested)
|
|
459
|
+
if (!screenshot) {
|
|
460
|
+
await page.route('**/*', (route) => {
|
|
461
|
+
const resourceType = route.request().resourceType();
|
|
462
|
+
if (['image', 'font', 'media', 'stylesheet'].includes(resourceType)) {
|
|
463
|
+
route.abort();
|
|
464
|
+
}
|
|
465
|
+
else {
|
|
466
|
+
route.continue();
|
|
467
|
+
}
|
|
468
|
+
});
|
|
469
|
+
}
|
|
470
|
+
else {
|
|
471
|
+
// For screenshots, allow all resources
|
|
472
|
+
await page.route('**/*', (route) => route.continue());
|
|
473
|
+
}
|
|
400
474
|
// SECURITY: Wrap entire operation in timeout
|
|
401
475
|
const fetchPromise = (async () => {
|
|
402
476
|
await page.goto(url, {
|
|
@@ -422,9 +496,18 @@ export async function browserFetch(url, options = {}) {
|
|
|
422
496
|
if (!html || html.length < 100) {
|
|
423
497
|
throw new BlockedError('Empty or suspiciously small response from browser.');
|
|
424
498
|
}
|
|
499
|
+
// Capture screenshot if requested
|
|
500
|
+
let screenshotBuffer;
|
|
501
|
+
if (screenshot) {
|
|
502
|
+
screenshotBuffer = await page.screenshot({
|
|
503
|
+
fullPage: screenshotFullPage,
|
|
504
|
+
type: 'png'
|
|
505
|
+
});
|
|
506
|
+
}
|
|
425
507
|
return {
|
|
426
508
|
html,
|
|
427
509
|
url: finalUrl,
|
|
510
|
+
screenshot: screenshotBuffer,
|
|
428
511
|
};
|
|
429
512
|
}
|
|
430
513
|
catch (error) {
|
|
@@ -475,5 +558,9 @@ export async function cleanup() {
|
|
|
475
558
|
await sharedBrowser.close();
|
|
476
559
|
sharedBrowser = null;
|
|
477
560
|
}
|
|
561
|
+
if (sharedStealthBrowser) {
|
|
562
|
+
await sharedStealthBrowser.close();
|
|
563
|
+
sharedStealthBrowser = null;
|
|
564
|
+
}
|
|
478
565
|
}
|
|
479
566
|
//# sourceMappingURL=fetcher.js.map
|