scrapex 1.0.0-alpha.1 → 1.0.0-beta.2
This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- package/README.md +164 -5
- package/dist/embeddings/index.cjs +52 -0
- package/dist/embeddings/index.d.cts +3 -0
- package/dist/embeddings/index.d.mts +3 -0
- package/dist/embeddings/index.mjs +4 -0
- package/dist/embeddings-BjNTQSG9.cjs +1455 -0
- package/dist/embeddings-BjNTQSG9.cjs.map +1 -0
- package/dist/embeddings-Bsymy_jA.mjs +1215 -0
- package/dist/embeddings-Bsymy_jA.mjs.map +1 -0
- package/dist/{enhancer-oM4BhYYS.cjs → enhancer-Cs_WyWtJ.cjs} +2 -51
- package/dist/enhancer-Cs_WyWtJ.cjs.map +1 -0
- package/dist/{enhancer-Q6CSc1gA.mjs → enhancer-INx5NlgO.mjs} +2 -45
- package/dist/enhancer-INx5NlgO.mjs.map +1 -0
- package/dist/http-base-CHLf-Tco.cjs +684 -0
- package/dist/http-base-CHLf-Tco.cjs.map +1 -0
- package/dist/http-base-DM7YNo6X.mjs +618 -0
- package/dist/http-base-DM7YNo6X.mjs.map +1 -0
- package/dist/index-Bvseqli-.d.cts +268 -0
- package/dist/index-Bvseqli-.d.cts.map +1 -0
- package/dist/index-CIFjNySr.d.mts +268 -0
- package/dist/index-CIFjNySr.d.mts.map +1 -0
- package/dist/index-D6qfjmZQ.d.mts +401 -0
- package/dist/index-D6qfjmZQ.d.mts.map +1 -0
- package/dist/index-RFSpP5g8.d.cts +401 -0
- package/dist/index-RFSpP5g8.d.cts.map +1 -0
- package/dist/index.cjs +171 -51
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +61 -2
- package/dist/index.d.cts.map +1 -1
- package/dist/index.d.mts +61 -2
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +129 -6
- package/dist/index.mjs.map +1 -1
- package/dist/llm/index.cjs +252 -233
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.cts +132 -85
- package/dist/llm/index.d.cts.map +1 -1
- package/dist/llm/index.d.mts +132 -85
- package/dist/llm/index.d.mts.map +1 -1
- package/dist/llm/index.mjs +244 -236
- package/dist/llm/index.mjs.map +1 -1
- package/dist/parsers/index.cjs +10 -199
- package/dist/parsers/index.d.cts +2 -133
- package/dist/parsers/index.d.mts +2 -133
- package/dist/parsers/index.mjs +2 -191
- package/dist/parsers-Bneuws8x.cjs +569 -0
- package/dist/parsers-Bneuws8x.cjs.map +1 -0
- package/dist/parsers-DsawHeo0.mjs +482 -0
- package/dist/parsers-DsawHeo0.mjs.map +1 -0
- package/dist/types-BOcHQU9s.d.mts +831 -0
- package/dist/types-BOcHQU9s.d.mts.map +1 -0
- package/dist/types-DutdBpqd.d.cts +831 -0
- package/dist/types-DutdBpqd.d.cts.map +1 -0
- package/package.json +15 -16
- package/dist/enhancer-Q6CSc1gA.mjs.map +0 -1
- package/dist/enhancer-oM4BhYYS.cjs.map +0 -1
- package/dist/parsers/index.cjs.map +0 -1
- package/dist/parsers/index.d.cts.map +0 -1
- package/dist/parsers/index.d.mts.map +0 -1
- package/dist/parsers/index.mjs.map +0 -1
- package/dist/types-CNQZVW36.d.mts +0 -150
- package/dist/types-CNQZVW36.d.mts.map +0 -1
- package/dist/types-D0HYR95H.d.cts +0 -150
- package/dist/types-D0HYR95H.d.cts.map +0 -1
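The most visible change in the listing above is the new embeddings entry point: `dist/embeddings/index.mjs`, `dist/embeddings/index.cjs`, and matching `.d.mts`/`.d.cts` declarations, exposed via a `./embeddings` subpath export in `package.json` (see the diff below). A minimal sketch of how that subpath resolves for ESM and CJS consumers, assuming `scrapex@1.0.0-beta.2` is installed; the module's exported names are not visible in this diff, so only namespace access is shown:

```ts
// sketch.mts: resolving the new "scrapex/embeddings" subpath export.
import { createRequire } from 'node:module';

// ESM: the "import" condition maps to ./dist/embeddings/index.mjs
import * as embeddings from 'scrapex/embeddings';

// CJS: the "require" condition maps to ./dist/embeddings/index.cjs
const require = createRequire(import.meta.url);
const embeddingsCjs = require('scrapex/embeddings');

console.log(Object.keys(embeddings), Object.keys(embeddingsCjs));
```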
package/dist/types-DutdBpqd.d.cts.map
ADDED

@@ -0,0 +1 @@
+ (generated source map for types-DutdBpqd.d.cts; single-line JSON body omitted)
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "scrapex",
-  "version": "1.0.0-alpha.1",
+  "version": "1.0.0-beta.2",
   "description": "Modern web scraper with LLM-enhanced extraction, extensible pipeline, and pluggable parsers",
   "type": "module",
   "exports": {
@@ -18,6 +18,11 @@
       "types": "./dist/llm/index.d.mts",
       "import": "./dist/llm/index.mjs",
       "require": "./dist/llm/index.cjs"
+    },
+    "./embeddings": {
+      "types": "./dist/embeddings/index.d.mts",
+      "import": "./dist/embeddings/index.mjs",
+      "require": "./dist/embeddings/index.cjs"
     }
   },
   "main": "./dist/index.cjs",
@@ -53,7 +58,9 @@
     "extraction",
     "readability",
     "markdown",
-    "parser"
+    "parser",
+    "embeddings",
+    "vector-search"
   ],
   "author": "Rakesh Paul <https://binaryroute.com/authors/rk-paul/>",
   "license": "MIT",
@@ -63,35 +70,27 @@
   "dependencies": {
     "@mozilla/readability": "^0.6.0",
     "cheerio": "^1.1.2",
-    "jsdom": "^27.
+    "jsdom": "^27.4.0",
     "mdast-util-from-markdown": "^2.0.2",
     "mdast-util-to-string": "^4.0.0",
     "turndown": "^7.2.2",
     "unist-util-visit": "^5.0.0",
-    "zod": "^4.
+    "zod": "^4.3.4"
   },
   "devDependencies": {
-    "@biomejs/biome": "^2.3.
+    "@biomejs/biome": "^2.3.10",
     "@types/jsdom": "^27.0.0",
     "@types/mdast": "^4.0.4",
     "@types/node": "^22.10.0",
     "@types/turndown": "^5.0.6",
-    "tsdown": "^0.
+    "tsdown": "^0.18.4",
     "typescript": "^5.9.3",
-    "vitest": "^4.0.
+    "vitest": "^4.0.16"
   },
   "peerDependencies": {
-    "
-    "openai": ">=4.0.0",
-    "puppeteer": ">=23.0.0"
+    "puppeteer": "^24.34.0"
   },
   "peerDependenciesMeta": {
-    "@anthropic-ai/sdk": {
-      "optional": true
-    },
-    "openai": {
-      "optional": true
-    },
     "puppeteer": {
       "optional": true
     }
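Note the peer-dependency cleanup: `openai` is dropped (and, per the `peerDependenciesMeta` removal, so is `@anthropic-ai/sdk`), while `puppeteer` moves from `>=23.0.0` to `^24.34.0` and stays optional. A hedged sketch of the usual pattern for consuming an optional peer; this is illustrative only, not scrapex's actual loading code:

```ts
// Load the optional puppeteer peer lazily; return null when it is not installed.
async function loadPuppeteer(): Promise<unknown> {
  try {
    return await import('puppeteer');
  } catch {
    return null; // optional peer absent; fall back to plain HTTP fetching
  }
}

const puppeteer = await loadPuppeteer();
console.log(puppeteer ? 'puppeteer available' : 'running without puppeteer');
```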
package/dist/enhancer-Q6CSc1gA.mjs.map
REMOVED

@@ -1 +0,0 @@
- (generated source map for enhancer-Q6CSc1gA.mjs; single-line JSON body omitted)
package/dist/enhancer-oM4BhYYS.cjs.map
REMOVED

@@ -1 +0,0 @@
- (generated source map for enhancer-oM4BhYYS.cjs; single-line JSON body omitted)
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"index.cjs","names":["sections: MarkdownSection[]","allLinks: MarkdownLink[]","codeBlocks: CodeBlock[]","frontmatter: Record<string, unknown> | undefined","currentSection: MarkdownSection | null","linkData: MarkdownLink","result: Record<string, unknown>","value: string | boolean | number","links: MarkdownLink[]"],"sources":["../../src/parsers/github.ts","../../src/parsers/markdown.ts"],"sourcesContent":["import type { GitHubMeta, MarkdownLink } from './types.js';\n\n/**\n * GitHub-specific utilities for parsing repositories.\n */\n\n/**\n * Check if a URL is a GitHub repository\n */\nexport function isGitHubRepo(url: string): boolean {\n return /^https?:\\/\\/(www\\.)?github\\.com\\/[^/]+\\/[^/]+\\/?$/.test(url);\n}\n\n/**\n * Extract GitHub repo info from URL\n */\nexport function parseGitHubUrl(url: string): { owner: string; repo: string } | null {\n const match = url.match(/github\\.com\\/([^/]+)\\/([^/]+)/);\n if (!match || !match[1] || !match[2]) return null;\n return {\n owner: match[1],\n repo: match[2].replace(/\\.git$/, ''),\n };\n}\n\n/**\n * Convert a GitHub repo URL to raw content URL\n */\nexport function toRawUrl(url: string, branch = 'main', file = 'README.md'): string {\n const info = parseGitHubUrl(url);\n if (!info) return url;\n return `https://raw.githubusercontent.com/${info.owner}/${info.repo}/${branch}/${file}`;\n}\n\n/**\n * Fetch GitHub API metadata for a repository\n * Note: This is a placeholder - actual implementation would need GitHub API access\n */\nexport async function fetchRepoMeta(\n owner: string,\n repo: string,\n _token?: string\n): Promise<GitHubMeta> {\n // This would make actual API calls in a full implementation\n // For now, return basic info\n return {\n repoOwner: owner,\n repoName: repo,\n };\n}\n\n/**\n * Group links by their category/section\n */\nexport function groupByCategory(links: MarkdownLink[]): Map<string, MarkdownLink[]> {\n const groups = new Map<string, MarkdownLink[]>();\n\n for (const link of links) {\n const category = link.context || 'Uncategorized';\n const existing = groups.get(category) || [];\n existing.push(link);\n groups.set(category, existing);\n }\n\n return groups;\n}\n","import type { Code, Heading, Link, ListItem, Root } from 'mdast';\nimport { fromMarkdown } from 'mdast-util-from-markdown';\nimport { toString as mdastToString } from 'mdast-util-to-string';\nimport { visit } from 'unist-util-visit';\nimport type {\n CodeBlock,\n MarkdownLink,\n MarkdownSection,\n ParsedMarkdown,\n ParserResult,\n SourceParser,\n} from './types.js';\n\n/**\n * Generic Markdown parser.\n * Extracts structure, links, and code blocks from markdown content.\n *\n * @example\n * ```ts\n * const parser = new MarkdownParser();\n * const result = parser.parse(markdownContent);\n * console.log(result.data.sections);\n * console.log(result.data.links);\n * ```\n */\nexport class MarkdownParser implements SourceParser<ParsedMarkdown> {\n readonly name = 'markdown';\n\n canParse(content: string): boolean {\n // Check for common markdown patterns\n return (\n content.includes('# ') ||\n content.includes('## ') ||\n content.includes('- [') ||\n content.includes('* [') ||\n content.includes('```')\n );\n }\n\n parse(content: string): ParserResult<ParsedMarkdown> {\n const tree = fromMarkdown(content);\n const sections: MarkdownSection[] = [];\n const allLinks: MarkdownLink[] = [];\n const codeBlocks: CodeBlock[] = [];\n let frontmatter: Record<string, unknown> | undefined;\n\n // Extract frontmatter if present\n if 
(content.startsWith('---')) {\n const endIndex = content.indexOf('---', 3);\n if (endIndex !== -1) {\n const frontmatterContent = content.slice(3, endIndex).trim();\n frontmatter = this.parseFrontmatter(frontmatterContent);\n }\n }\n\n // Track current section\n let currentSection: MarkdownSection | null = null;\n\n // Process the AST\n visit(tree, (node) => {\n // Handle headings\n if (node.type === 'heading') {\n const heading = node as Heading;\n const title = mdastToString(heading);\n\n // Finalize previous section\n if (currentSection) {\n sections.push(currentSection);\n }\n\n currentSection = {\n level: heading.depth,\n title,\n content: '',\n links: [],\n };\n }\n\n // Handle links\n if (node.type === 'link') {\n const link = node as Link;\n const text = mdastToString(link);\n const linkData: MarkdownLink = {\n url: link.url,\n text,\n title: link.title ?? undefined,\n context: currentSection?.title,\n };\n\n allLinks.push(linkData);\n if (currentSection) {\n currentSection.links.push(linkData);\n }\n }\n\n // Handle code blocks\n if (node.type === 'code') {\n const code = node as Code;\n codeBlocks.push({\n language: code.lang ?? undefined,\n code: code.value,\n meta: code.meta ?? undefined,\n });\n }\n\n // Accumulate content for current section\n if (currentSection && node.type === 'paragraph') {\n const text = mdastToString(node);\n currentSection.content += (currentSection.content ? '\\n\\n' : '') + text;\n }\n });\n\n // Finalize last section\n if (currentSection) {\n sections.push(currentSection);\n }\n\n // Extract title from first h1 or frontmatter\n const title = (frontmatter?.title as string) ?? sections.find((s) => s.level === 1)?.title;\n\n // Extract description from frontmatter or first paragraph before any heading\n const description = (frontmatter?.description as string) ?? 
this.extractDescription(tree);\n\n return {\n data: {\n title,\n description,\n sections,\n links: allLinks,\n codeBlocks,\n frontmatter,\n },\n };\n }\n\n private parseFrontmatter(content: string): Record<string, unknown> {\n const result: Record<string, unknown> = {};\n const lines = content.split('\\n');\n\n for (const line of lines) {\n const colonIndex = line.indexOf(':');\n if (colonIndex > 0) {\n const key = line.slice(0, colonIndex).trim();\n let value: string | boolean | number = line.slice(colonIndex + 1).trim();\n\n // Parse simple types\n if (value === 'true') value = true;\n else if (value === 'false') value = false;\n else if (/^-?\\d+(\\.\\d+)?$/.test(value)) value = Number(value);\n else if (value.startsWith('\"') && value.endsWith('\"')) value = value.slice(1, -1);\n else if (value.startsWith(\"'\") && value.endsWith(\"'\")) value = value.slice(1, -1);\n\n result[key] = value;\n }\n }\n\n return result;\n }\n\n private extractDescription(tree: Root): string | undefined {\n // Find first paragraph before any heading\n for (const node of tree.children) {\n if (node.type === 'heading') break;\n if (node.type === 'paragraph') {\n return mdastToString(node);\n }\n }\n return undefined;\n }\n}\n\n/**\n * Extract links from a list-based markdown structure (like awesome lists)\n */\nexport function extractListLinks(markdown: string): MarkdownLink[] {\n const tree = fromMarkdown(markdown);\n const links: MarkdownLink[] = [];\n let currentHeading = '';\n\n visit(tree, (node) => {\n if (node.type === 'heading') {\n currentHeading = mdastToString(node as Heading);\n }\n\n if (node.type === 'listItem') {\n const listItem = node as ListItem;\n\n // Find links in this list item\n visit(listItem, 'link', (linkNode: Link) => {\n links.push({\n url: linkNode.url,\n text: mdastToString(linkNode),\n title: linkNode.title ?? 
undefined,\n context: currentHeading || undefined,\n });\n });\n }\n });\n\n return links;\n}\n\n/**\n * Parse markdown into sections by heading level\n */\nexport function parseByHeadings(markdown: string, minLevel = 2): MarkdownSection[] {\n const parser = new MarkdownParser();\n const result = parser.parse(markdown);\n return result.data.sections.filter((s) => s.level >= minLevel);\n}\n"],"mappings":";;;;;;;;;;;;AASA,SAAgB,aAAa,KAAsB;AACjD,QAAO,oDAAoD,KAAK,IAAI;;;;;AAMtE,SAAgB,eAAe,KAAqD;CAClF,MAAM,QAAQ,IAAI,MAAM,gCAAgC;AACxD,KAAI,CAAC,SAAS,CAAC,MAAM,MAAM,CAAC,MAAM,GAAI,QAAO;AAC7C,QAAO;EACL,OAAO,MAAM;EACb,MAAM,MAAM,GAAG,QAAQ,UAAU,GAAG;EACrC;;;;;AAMH,SAAgB,SAAS,KAAa,SAAS,QAAQ,OAAO,aAAqB;CACjF,MAAM,OAAO,eAAe,IAAI;AAChC,KAAI,CAAC,KAAM,QAAO;AAClB,QAAO,qCAAqC,KAAK,MAAM,GAAG,KAAK,KAAK,GAAG,OAAO,GAAG;;;;;;AAOnF,eAAsB,cACpB,OACA,MACA,QACqB;AAGrB,QAAO;EACL,WAAW;EACX,UAAU;EACX;;;;;AAMH,SAAgB,gBAAgB,OAAoD;CAClF,MAAM,yBAAS,IAAI,KAA6B;AAEhD,MAAK,MAAM,QAAQ,OAAO;EACxB,MAAM,WAAW,KAAK,WAAW;EACjC,MAAM,WAAW,OAAO,IAAI,SAAS,IAAI,EAAE;AAC3C,WAAS,KAAK,KAAK;AACnB,SAAO,IAAI,UAAU,SAAS;;AAGhC,QAAO;;;;;;;;;;;;;;;;;ACvCT,IAAa,iBAAb,MAAoE;CAClE,AAAS,OAAO;CAEhB,SAAS,SAA0B;AAEjC,SACE,QAAQ,SAAS,KAAK,IACtB,QAAQ,SAAS,MAAM,IACvB,QAAQ,SAAS,MAAM,IACvB,QAAQ,SAAS,MAAM,IACvB,QAAQ,SAAS,MAAM;;CAI3B,MAAM,SAA+C;EACnD,MAAM,kDAAoB,QAAQ;EAClC,MAAMA,WAA8B,EAAE;EACtC,MAAMC,WAA2B,EAAE;EACnC,MAAMC,aAA0B,EAAE;EAClC,IAAIC;AAGJ,MAAI,QAAQ,WAAW,MAAM,EAAE;GAC7B,MAAM,WAAW,QAAQ,QAAQ,OAAO,EAAE;AAC1C,OAAI,aAAa,IAAI;IACnB,MAAM,qBAAqB,QAAQ,MAAM,GAAG,SAAS,CAAC,MAAM;AAC5D,kBAAc,KAAK,iBAAiB,mBAAmB;;;EAK3D,IAAIC,iBAAyC;AAG7C,8BAAM,OAAO,SAAS;AAEpB,OAAI,KAAK,SAAS,WAAW;IAC3B,MAAM,UAAU;IAChB,MAAM,2CAAsB,QAAQ;AAGpC,QAAI,eACF,UAAS,KAAK,eAAe;AAG/B,qBAAiB;KACf,OAAO,QAAQ;KACf;KACA,SAAS;KACT,OAAO,EAAE;KACV;;AAIH,OAAI,KAAK,SAAS,QAAQ;IACxB,MAAM,OAAO;IACb,MAAM,0CAAqB,KAAK;IAChC,MAAMC,WAAyB;KAC7B,KAAK,KAAK;KACV;KACA,OAAO,KAAK,SAAS;KACrB,SAAS,gBAAgB;KAC1B;AAED,aAAS,KAAK,SAAS;AACvB,QAAI,eACF,gBAAe,MAAM,KAAK,SAAS;;AAKvC,OAAI,KAAK,SAAS,QAAQ;IACxB,MAAM,OAAO;AACb,eAAW,KAAK;KACd,UAAU,KAAK,QAAQ;KACvB,MAAM,KAAK;KACX,MAAM,KAAK,QAAQ;KACpB,CAAC;;AAIJ,OAAI,kBAAkB,KAAK,SAAS,aAAa;IAC/C,MAAM,0CAAqB,KAAK;AAChC,mBAAe,YAAY,eAAe,UAAU,SAAS,MAAM;;IAErE;AAGF,MAAI,eACF,UAAS,KAAK,eAAe;AAS/B,SAAO,EACL,MAAM;GACJ,OAPW,aAAa,SAAoB,SAAS,MAAM,MAAM,EAAE,UAAU,EAAE,EAAE;GAQjF,aALiB,aAAa,eAA0B,KAAK,mBAAmB,KAAK;GAMrF;GACA,OAAO;GACP;GACA;GACD,EACF;;CAGH,AAAQ,iBAAiB,SAA0C;EACjE,MAAMC,SAAkC,EAAE;EAC1C,MAAM,QAAQ,QAAQ,MAAM,KAAK;AAEjC,OAAK,MAAM,QAAQ,OAAO;GACxB,MAAM,aAAa,KAAK,QAAQ,IAAI;AACpC,OAAI,aAAa,GAAG;IAClB,MAAM,MAAM,KAAK,MAAM,GAAG,WAAW,CAAC,MAAM;IAC5C,IAAIC,QAAmC,KAAK,MAAM,aAAa,EAAE,CAAC,MAAM;AAGxE,QAAI,UAAU,OAAQ,SAAQ;aACrB,UAAU,QAAS,SAAQ;aAC3B,kBAAkB,KAAK,MAAM,CAAE,SAAQ,OAAO,MAAM;aACpD,MAAM,WAAW,KAAI,IAAI,MAAM,SAAS,KAAI,CAAE,SAAQ,MAAM,MAAM,GAAG,GAAG;aACxE,MAAM,WAAW,IAAI,IAAI,MAAM,SAAS,IAAI,CAAE,SAAQ,MAAM,MAAM,GAAG,GAAG;AAEjF,WAAO,OAAO;;;AAIlB,SAAO;;CAGT,AAAQ,mBAAmB,MAAgC;AAEzD,OAAK,MAAM,QAAQ,KAAK,UAAU;AAChC,OAAI,KAAK,SAAS,UAAW;AAC7B,OAAI,KAAK,SAAS,YAChB,2CAAqB,KAAK;;;;;;;AAUlC,SAAgB,iBAAiB,UAAkC;CACjE,MAAM,kDAAoB,SAAS;CACnC,MAAMC,QAAwB,EAAE;CAChC,IAAI,iBAAiB;AAErB,6BAAM,OAAO,SAAS;AACpB,MAAI,KAAK,SAAS,UAChB,qDAA+B,KAAgB;AAGjD,MAAI,KAAK,SAAS,WAIhB,6BAHiB,MAGD,SAAS,aAAmB;AAC1C,SAAM,KAAK;IACT,KAAK,SAAS;IACd,yCAAoB,SAAS;IAC7B,OAAO,SAAS,SAAS;IACzB,SAAS,kBAAkB;IAC5B,CAAC;IACF;GAEJ;AAEF,QAAO;;;;;AAMT,SAAgB,gBAAgB,UAAkB,WAAW,GAAsB;AAGjF,QAFe,IAAI,gBAAgB,CACb,MAAM,SAAS,CACvB,KAAK,SAAS,QAAQ,MAAM,EAAE,SAAS,SAAS"}
@@ -1 +0,0 @@
{"version":3,"file":"index.d.cts","names":[],"sources":["../../src/parsers/types.ts","../../src/parsers/github.ts","../../src/parsers/markdown.ts"],"sourcesContent":[],"mappings":";;AAOA;;;;;AAiBA;AAQiB,UAzBA,YAyBY,CAAA,KAAA,EAAA,QAAA,OAAA,CAAA,CAAA;EAUZ,SAAA,IAAA,EAAA,MAAe;EAUf;;;EAKH,QAAA,CAAA,OAAA,EAAA,MAAA,EAAA,GAAA,CAAA,EAAA,MAAA,CAAA,EAAA,OAAA;EACE;;AAMhB;EASiB,KAAA,CAAA,OAAU,EAAA,MAAA,EAAA,GAAA,CAAA,EAAA,MAAA,CAAA,EAvDa,YAuDb,CAvD0B,KAuD1B,EAvDiC,KAuDjC,CAAA;;;;AChE3B;AAOgB,UDQC,YCRa,CAAA,KAAA,EAAA,QAAA,OAAA,CAAA,CAAA;EAYd,IAAA,EDHR,KCGgB;EAUF,IAAA,CAAA,EDZb,KCYa;AAgBtB;;;;AAA2D,UDtB1C,YAAA,CCsB0C;;;;EC7B9C,OAAA,CAAA,EAAA,MAAA;;;;;AAAsC,UFiBlC,eAAA,CEjBkC;EAqJnC,KAAA,EAAA,MAAA;EA+BA,KAAA,EAAA,MAAA;;SF/JP;;;;;UAMQ,cAAA;;;YAGL;SACH;cACK;gBACE;;;;;UAMC,SAAA;;;;;;;;UASA,UAAA;;;;;;;;AAlEjB;;;;;AAiBA;AAQiB,iBCvBD,YAAA,CDuBa,GAAA,EAAA,MAAA,CAAA,EAAA,OAAA;AAU7B;AAUA;;AAIS,iBCxCO,cAAA,CDwCP,GAAA,EAAA,MAAA,CAAA,EAAA;EACK,KAAA,EAAA,MAAA;EACE,IAAA,EAAA,MAAA;CAAM,GAAA,IAAA;AAMtB;AASA;;iBC7CgB,QAAA;;AAnBhB;AAOA;AAYA;AAUsB,iBAAA,aAAA,CAIX,KAAA,EAAR,MAAA,EAAO,IAAA,EAAA,MAAA,EAAA,MAAA,CAAA,EAAA,MAAA,CAAA,EAAP,OAAO,CAAC,UAAD,CAAA;AAYV;;;AAAwD,iBAAxC,eAAA,CAAwC,KAAA,EAAjB,YAAiB,EAAA,CAAA,EAAA,GAAA,CAAA,MAAA,EAAY,YAAZ,EAAA,CAAA;;;AD/CxD;;;;;AAiBA;AAQA;AAUA;AAUA;;;;AAMgB,cEjCH,cAAA,YAA0B,YFiCvB,CEjCoC,cFiCpC,CAAA,CAAA;EAAM,SAAA,IAAA,GAAA,UAAA;EAML,QAAA,CAAA,OAAS,EAAA,MAAA,CAAA,EAAA,OAAA;EAST,KAAA,CAAA,OAAU,EAAA,MAAA,CAAA,EElCD,YFkCC,CElCY,cFkCZ,CAAA;;;;AChE3B;AAOA;AAYA;AAUsB,iBCwIN,gBAAA,CDpIL,QAAR,EAAA,MAAO,CAAA,ECoI0C,YDpI1C,EAAA;AAYV;;;AAAwD,iBCuJxC,eAAA,CDvJwC,QAAA,EAAA,MAAA,EAAA,QAAA,CAAA,EAAA,MAAA,CAAA,ECuJS,eDvJT,EAAA"}
@@ -1 +0,0 @@
{"version":3,"file":"index.d.mts","names":[],"sources":["../../src/parsers/types.ts","../../src/parsers/github.ts","../../src/parsers/markdown.ts"],"sourcesContent":[],"mappings":";;AAOA;;;;;AAiBA;AAQiB,UAzBA,YAyBY,CAAA,KAAA,EAAA,QAAA,OAAA,CAAA,CAAA;EAUZ,SAAA,IAAA,EAAA,MAAe;EAUf;;;EAKH,QAAA,CAAA,OAAA,EAAA,MAAA,EAAA,GAAA,CAAA,EAAA,MAAA,CAAA,EAAA,OAAA;EACE;;AAMhB;EASiB,KAAA,CAAA,OAAU,EAAA,MAAA,EAAA,GAAA,CAAA,EAAA,MAAA,CAAA,EAvDa,YAuDb,CAvD0B,KAuD1B,EAvDiC,KAuDjC,CAAA;;;;AChE3B;AAOgB,UDQC,YCRa,CAAA,KAAA,EAAA,QAAA,OAAA,CAAA,CAAA;EAYd,IAAA,EDHR,KCGgB;EAUF,IAAA,CAAA,EDZb,KCYa;AAgBtB;;;;AAA2D,UDtB1C,YAAA,CCsB0C;;;;EC7B9C,OAAA,CAAA,EAAA,MAAA;;;;;AAAsC,UFiBlC,eAAA,CEjBkC;EAqJnC,KAAA,EAAA,MAAA;EA+BA,KAAA,EAAA,MAAA;;SF/JP;;;;;UAMQ,cAAA;;;YAGL;SACH;cACK;gBACE;;;;;UAMC,SAAA;;;;;;;;UASA,UAAA;;;;;;;;AAlEjB;;;;;AAiBA;AAQiB,iBCvBD,YAAA,CDuBa,GAAA,EAAA,MAAA,CAAA,EAAA,OAAA;AAU7B;AAUA;;AAIS,iBCxCO,cAAA,CDwCP,GAAA,EAAA,MAAA,CAAA,EAAA;EACK,KAAA,EAAA,MAAA;EACE,IAAA,EAAA,MAAA;CAAM,GAAA,IAAA;AAMtB;AASA;;iBC7CgB,QAAA;;AAnBhB;AAOA;AAYA;AAUsB,iBAAA,aAAA,CAIX,KAAA,EAAR,MAAA,EAAO,IAAA,EAAA,MAAA,EAAA,MAAA,CAAA,EAAA,MAAA,CAAA,EAAP,OAAO,CAAC,UAAD,CAAA;AAYV;;;AAAwD,iBAAxC,eAAA,CAAwC,KAAA,EAAjB,YAAiB,EAAA,CAAA,EAAA,GAAA,CAAA,MAAA,EAAY,YAAZ,EAAA,CAAA;;;AD/CxD;;;;;AAiBA;AAQA;AAUA;AAUA;;;;AAMgB,cEjCH,cAAA,YAA0B,YFiCvB,CEjCoC,cFiCpC,CAAA,CAAA;EAAM,SAAA,IAAA,GAAA,UAAA;EAML,QAAA,CAAA,OAAS,EAAA,MAAA,CAAA,EAAA,OAAA;EAST,KAAA,CAAA,OAAU,EAAA,MAAA,CAAA,EElCD,YFkCC,CElCY,cFkCZ,CAAA;;;;AChE3B;AAOA;AAYA;AAUsB,iBCwIN,gBAAA,CDpIL,QAAR,EAAA,MAAO,CAAA,ECoI0C,YDpI1C,EAAA;AAYV;;;AAAwD,iBCuJxC,eAAA,CDvJwC,QAAA,EAAA,MAAA,EAAA,QAAA,CAAA,EAAA,MAAA,CAAA,ECuJS,eDvJT,EAAA"}
@@ -1 +0,0 @@
{"version":3,"file":"index.mjs","names":["sections: MarkdownSection[]","allLinks: MarkdownLink[]","codeBlocks: CodeBlock[]","frontmatter: Record<string, unknown> | undefined","currentSection: MarkdownSection | null","mdastToString","linkData: MarkdownLink","result: Record<string, unknown>","value: string | boolean | number","links: MarkdownLink[]"],"sources":["../../src/parsers/github.ts","../../src/parsers/markdown.ts"],"sourcesContent":["import type { GitHubMeta, MarkdownLink } from './types.js';\n\n/**\n * GitHub-specific utilities for parsing repositories.\n */\n\n/**\n * Check if a URL is a GitHub repository\n */\nexport function isGitHubRepo(url: string): boolean {\n return /^https?:\\/\\/(www\\.)?github\\.com\\/[^/]+\\/[^/]+\\/?$/.test(url);\n}\n\n/**\n * Extract GitHub repo info from URL\n */\nexport function parseGitHubUrl(url: string): { owner: string; repo: string } | null {\n const match = url.match(/github\\.com\\/([^/]+)\\/([^/]+)/);\n if (!match || !match[1] || !match[2]) return null;\n return {\n owner: match[1],\n repo: match[2].replace(/\\.git$/, ''),\n };\n}\n\n/**\n * Convert a GitHub repo URL to raw content URL\n */\nexport function toRawUrl(url: string, branch = 'main', file = 'README.md'): string {\n const info = parseGitHubUrl(url);\n if (!info) return url;\n return `https://raw.githubusercontent.com/${info.owner}/${info.repo}/${branch}/${file}`;\n}\n\n/**\n * Fetch GitHub API metadata for a repository\n * Note: This is a placeholder - actual implementation would need GitHub API access\n */\nexport async function fetchRepoMeta(\n owner: string,\n repo: string,\n _token?: string\n): Promise<GitHubMeta> {\n // This would make actual API calls in a full implementation\n // For now, return basic info\n return {\n repoOwner: owner,\n repoName: repo,\n };\n}\n\n/**\n * Group links by their category/section\n */\nexport function groupByCategory(links: MarkdownLink[]): Map<string, MarkdownLink[]> {\n const groups = new Map<string, MarkdownLink[]>();\n\n for (const link of links) {\n const category = link.context || 'Uncategorized';\n const existing = groups.get(category) || [];\n existing.push(link);\n groups.set(category, existing);\n }\n\n return groups;\n}\n","import type { Code, Heading, Link, ListItem, Root } from 'mdast';\nimport { fromMarkdown } from 'mdast-util-from-markdown';\nimport { toString as mdastToString } from 'mdast-util-to-string';\nimport { visit } from 'unist-util-visit';\nimport type {\n CodeBlock,\n MarkdownLink,\n MarkdownSection,\n ParsedMarkdown,\n ParserResult,\n SourceParser,\n} from './types.js';\n\n/**\n * Generic Markdown parser.\n * Extracts structure, links, and code blocks from markdown content.\n *\n * @example\n * ```ts\n * const parser = new MarkdownParser();\n * const result = parser.parse(markdownContent);\n * console.log(result.data.sections);\n * console.log(result.data.links);\n * ```\n */\nexport class MarkdownParser implements SourceParser<ParsedMarkdown> {\n readonly name = 'markdown';\n\n canParse(content: string): boolean {\n // Check for common markdown patterns\n return (\n content.includes('# ') ||\n content.includes('## ') ||\n content.includes('- [') ||\n content.includes('* [') ||\n content.includes('```')\n );\n }\n\n parse(content: string): ParserResult<ParsedMarkdown> {\n const tree = fromMarkdown(content);\n const sections: MarkdownSection[] = [];\n const allLinks: MarkdownLink[] = [];\n const codeBlocks: CodeBlock[] = [];\n let frontmatter: Record<string, unknown> | undefined;\n\n // Extract frontmatter if present\n 
if (content.startsWith('---')) {\n const endIndex = content.indexOf('---', 3);\n if (endIndex !== -1) {\n const frontmatterContent = content.slice(3, endIndex).trim();\n frontmatter = this.parseFrontmatter(frontmatterContent);\n }\n }\n\n // Track current section\n let currentSection: MarkdownSection | null = null;\n\n // Process the AST\n visit(tree, (node) => {\n // Handle headings\n if (node.type === 'heading') {\n const heading = node as Heading;\n const title = mdastToString(heading);\n\n // Finalize previous section\n if (currentSection) {\n sections.push(currentSection);\n }\n\n currentSection = {\n level: heading.depth,\n title,\n content: '',\n links: [],\n };\n }\n\n // Handle links\n if (node.type === 'link') {\n const link = node as Link;\n const text = mdastToString(link);\n const linkData: MarkdownLink = {\n url: link.url,\n text,\n title: link.title ?? undefined,\n context: currentSection?.title,\n };\n\n allLinks.push(linkData);\n if (currentSection) {\n currentSection.links.push(linkData);\n }\n }\n\n // Handle code blocks\n if (node.type === 'code') {\n const code = node as Code;\n codeBlocks.push({\n language: code.lang ?? undefined,\n code: code.value,\n meta: code.meta ?? undefined,\n });\n }\n\n // Accumulate content for current section\n if (currentSection && node.type === 'paragraph') {\n const text = mdastToString(node);\n currentSection.content += (currentSection.content ? '\\n\\n' : '') + text;\n }\n });\n\n // Finalize last section\n if (currentSection) {\n sections.push(currentSection);\n }\n\n // Extract title from first h1 or frontmatter\n const title = (frontmatter?.title as string) ?? sections.find((s) => s.level === 1)?.title;\n\n // Extract description from frontmatter or first paragraph before any heading\n const description = (frontmatter?.description as string) ?? 
this.extractDescription(tree);\n\n return {\n data: {\n title,\n description,\n sections,\n links: allLinks,\n codeBlocks,\n frontmatter,\n },\n };\n }\n\n private parseFrontmatter(content: string): Record<string, unknown> {\n const result: Record<string, unknown> = {};\n const lines = content.split('\\n');\n\n for (const line of lines) {\n const colonIndex = line.indexOf(':');\n if (colonIndex > 0) {\n const key = line.slice(0, colonIndex).trim();\n let value: string | boolean | number = line.slice(colonIndex + 1).trim();\n\n // Parse simple types\n if (value === 'true') value = true;\n else if (value === 'false') value = false;\n else if (/^-?\\d+(\\.\\d+)?$/.test(value)) value = Number(value);\n else if (value.startsWith('\"') && value.endsWith('\"')) value = value.slice(1, -1);\n else if (value.startsWith(\"'\") && value.endsWith(\"'\")) value = value.slice(1, -1);\n\n result[key] = value;\n }\n }\n\n return result;\n }\n\n private extractDescription(tree: Root): string | undefined {\n // Find first paragraph before any heading\n for (const node of tree.children) {\n if (node.type === 'heading') break;\n if (node.type === 'paragraph') {\n return mdastToString(node);\n }\n }\n return undefined;\n }\n}\n\n/**\n * Extract links from a list-based markdown structure (like awesome lists)\n */\nexport function extractListLinks(markdown: string): MarkdownLink[] {\n const tree = fromMarkdown(markdown);\n const links: MarkdownLink[] = [];\n let currentHeading = '';\n\n visit(tree, (node) => {\n if (node.type === 'heading') {\n currentHeading = mdastToString(node as Heading);\n }\n\n if (node.type === 'listItem') {\n const listItem = node as ListItem;\n\n // Find links in this list item\n visit(listItem, 'link', (linkNode: Link) => {\n links.push({\n url: linkNode.url,\n text: mdastToString(linkNode),\n title: linkNode.title ?? 
undefined,\n context: currentHeading || undefined,\n });\n });\n }\n });\n\n return links;\n}\n\n/**\n * Parse markdown into sections by heading level\n */\nexport function parseByHeadings(markdown: string, minLevel = 2): MarkdownSection[] {\n const parser = new MarkdownParser();\n const result = parser.parse(markdown);\n return result.data.sections.filter((s) => s.level >= minLevel);\n}\n"],"mappings":";;;;;;;;;;;AASA,SAAgB,aAAa,KAAsB;AACjD,QAAO,oDAAoD,KAAK,IAAI;;;;;AAMtE,SAAgB,eAAe,KAAqD;CAClF,MAAM,QAAQ,IAAI,MAAM,gCAAgC;AACxD,KAAI,CAAC,SAAS,CAAC,MAAM,MAAM,CAAC,MAAM,GAAI,QAAO;AAC7C,QAAO;EACL,OAAO,MAAM;EACb,MAAM,MAAM,GAAG,QAAQ,UAAU,GAAG;EACrC;;;;;AAMH,SAAgB,SAAS,KAAa,SAAS,QAAQ,OAAO,aAAqB;CACjF,MAAM,OAAO,eAAe,IAAI;AAChC,KAAI,CAAC,KAAM,QAAO;AAClB,QAAO,qCAAqC,KAAK,MAAM,GAAG,KAAK,KAAK,GAAG,OAAO,GAAG;;;;;;AAOnF,eAAsB,cACpB,OACA,MACA,QACqB;AAGrB,QAAO;EACL,WAAW;EACX,UAAU;EACX;;;;;AAMH,SAAgB,gBAAgB,OAAoD;CAClF,MAAM,yBAAS,IAAI,KAA6B;AAEhD,MAAK,MAAM,QAAQ,OAAO;EACxB,MAAM,WAAW,KAAK,WAAW;EACjC,MAAM,WAAW,OAAO,IAAI,SAAS,IAAI,EAAE;AAC3C,WAAS,KAAK,KAAK;AACnB,SAAO,IAAI,UAAU,SAAS;;AAGhC,QAAO;;;;;;;;;;;;;;;;;ACvCT,IAAa,iBAAb,MAAoE;CAClE,AAAS,OAAO;CAEhB,SAAS,SAA0B;AAEjC,SACE,QAAQ,SAAS,KAAK,IACtB,QAAQ,SAAS,MAAM,IACvB,QAAQ,SAAS,MAAM,IACvB,QAAQ,SAAS,MAAM,IACvB,QAAQ,SAAS,MAAM;;CAI3B,MAAM,SAA+C;EACnD,MAAM,OAAO,aAAa,QAAQ;EAClC,MAAMA,WAA8B,EAAE;EACtC,MAAMC,WAA2B,EAAE;EACnC,MAAMC,aAA0B,EAAE;EAClC,IAAIC;AAGJ,MAAI,QAAQ,WAAW,MAAM,EAAE;GAC7B,MAAM,WAAW,QAAQ,QAAQ,OAAO,EAAE;AAC1C,OAAI,aAAa,IAAI;IACnB,MAAM,qBAAqB,QAAQ,MAAM,GAAG,SAAS,CAAC,MAAM;AAC5D,kBAAc,KAAK,iBAAiB,mBAAmB;;;EAK3D,IAAIC,iBAAyC;AAG7C,QAAM,OAAO,SAAS;AAEpB,OAAI,KAAK,SAAS,WAAW;IAC3B,MAAM,UAAU;IAChB,MAAM,QAAQC,SAAc,QAAQ;AAGpC,QAAI,eACF,UAAS,KAAK,eAAe;AAG/B,qBAAiB;KACf,OAAO,QAAQ;KACf;KACA,SAAS;KACT,OAAO,EAAE;KACV;;AAIH,OAAI,KAAK,SAAS,QAAQ;IACxB,MAAM,OAAO;IACb,MAAM,OAAOA,SAAc,KAAK;IAChC,MAAMC,WAAyB;KAC7B,KAAK,KAAK;KACV;KACA,OAAO,KAAK,SAAS;KACrB,SAAS,gBAAgB;KAC1B;AAED,aAAS,KAAK,SAAS;AACvB,QAAI,eACF,gBAAe,MAAM,KAAK,SAAS;;AAKvC,OAAI,KAAK,SAAS,QAAQ;IACxB,MAAM,OAAO;AACb,eAAW,KAAK;KACd,UAAU,KAAK,QAAQ;KACvB,MAAM,KAAK;KACX,MAAM,KAAK,QAAQ;KACpB,CAAC;;AAIJ,OAAI,kBAAkB,KAAK,SAAS,aAAa;IAC/C,MAAM,OAAOD,SAAc,KAAK;AAChC,mBAAe,YAAY,eAAe,UAAU,SAAS,MAAM;;IAErE;AAGF,MAAI,eACF,UAAS,KAAK,eAAe;AAS/B,SAAO,EACL,MAAM;GACJ,OAPW,aAAa,SAAoB,SAAS,MAAM,MAAM,EAAE,UAAU,EAAE,EAAE;GAQjF,aALiB,aAAa,eAA0B,KAAK,mBAAmB,KAAK;GAMrF;GACA,OAAO;GACP;GACA;GACD,EACF;;CAGH,AAAQ,iBAAiB,SAA0C;EACjE,MAAME,SAAkC,EAAE;EAC1C,MAAM,QAAQ,QAAQ,MAAM,KAAK;AAEjC,OAAK,MAAM,QAAQ,OAAO;GACxB,MAAM,aAAa,KAAK,QAAQ,IAAI;AACpC,OAAI,aAAa,GAAG;IAClB,MAAM,MAAM,KAAK,MAAM,GAAG,WAAW,CAAC,MAAM;IAC5C,IAAIC,QAAmC,KAAK,MAAM,aAAa,EAAE,CAAC,MAAM;AAGxE,QAAI,UAAU,OAAQ,SAAQ;aACrB,UAAU,QAAS,SAAQ;aAC3B,kBAAkB,KAAK,MAAM,CAAE,SAAQ,OAAO,MAAM;aACpD,MAAM,WAAW,KAAI,IAAI,MAAM,SAAS,KAAI,CAAE,SAAQ,MAAM,MAAM,GAAG,GAAG;aACxE,MAAM,WAAW,IAAI,IAAI,MAAM,SAAS,IAAI,CAAE,SAAQ,MAAM,MAAM,GAAG,GAAG;AAEjF,WAAO,OAAO;;;AAIlB,SAAO;;CAGT,AAAQ,mBAAmB,MAAgC;AAEzD,OAAK,MAAM,QAAQ,KAAK,UAAU;AAChC,OAAI,KAAK,SAAS,UAAW;AAC7B,OAAI,KAAK,SAAS,YAChB,QAAOH,SAAc,KAAK;;;;;;;AAUlC,SAAgB,iBAAiB,UAAkC;CACjE,MAAM,OAAO,aAAa,SAAS;CACnC,MAAMI,QAAwB,EAAE;CAChC,IAAI,iBAAiB;AAErB,OAAM,OAAO,SAAS;AACpB,MAAI,KAAK,SAAS,UAChB,kBAAiBJ,SAAc,KAAgB;AAGjD,MAAI,KAAK,SAAS,WAIhB,OAHiB,MAGD,SAAS,aAAmB;AAC1C,SAAM,KAAK;IACT,KAAK,SAAS;IACd,MAAMA,SAAc,SAAS;IAC7B,OAAO,SAAS,SAAS;IACzB,SAAS,kBAAkB;IAC5B,CAAC;IACF;GAEJ;AAEF,QAAO;;;;;AAMT,SAAgB,gBAAgB,UAAkB,WAAW,GAAsB;AAGjF,QAFe,IAAI,gBAAgB,CACb,MAAM,SAAS,CACvB,KAAK,SAAS,QAAQ,MAAM,EAAE,SAAS,SAAS"}
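Note: the deleted map above embeds the full sources of src/parsers/github.ts and src/parsers/markdown.ts in its sourcesContent, so the removed module's API is recoverable. A minimal usage sketch follows; the 'scrapex/parsers' import path is an assumption based on the package's parsers entry point, while the names and behavior come from the embedded sources.

// Sketch only — import path assumed; APIs taken from the sources embedded above.
import {
  MarkdownParser,
  extractListLinks,
  parseByHeadings,
  parseGitHubUrl,
  toRawUrl,
} from 'scrapex/parsers';

const md = [
  '# Awesome Example',
  '',
  'A curated list.',
  '',
  '## Tools',
  '- [scrapex](https://github.com/example/scrapex) - scraping toolkit',
].join('\n');

const parser = new MarkdownParser();
if (parser.canParse(md)) {
  const { data } = parser.parse(md);
  console.log(data.title);        // "Awesome Example" (first h1)
  console.log(data.links.length); // 1
}

// Awesome-list style extraction: each link carries its section heading.
for (const link of extractListLinks(md)) {
  console.log(link.context, link.url); // "Tools https://github.com/example/scrapex"
}

// Only sections at heading level >= 2.
const sections = parseByHeadings(md, 2);

// GitHub helpers from the same bundle.
const info = parseGitHubUrl('https://github.com/example/scrapex'); // { owner: 'example', repo: 'scrapex' }
const raw = toRawUrl('https://github.com/example/scrapex');        // raw README.md URL on 'main'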
@@ -1,150 +0,0 @@
import { CheerioAPI } from "cheerio";

//#region src/core/types.d.ts

/**
 * Content type classification for scraped URLs
 */
type ContentType = 'article' | 'repo' | 'docs' | 'package' | 'video' | 'tool' | 'product' | 'unknown';
/**
 * Extracted link from content
 */
interface ExtractedLink {
  url: string;
  text: string;
  isExternal: boolean;
}
/**
 * Extracted entities from LLM enhancement
 */
interface ExtractedEntities {
  people: string[];
  organizations: string[];
  technologies: string[];
  locations: string[];
  concepts: string[];
}
/**
 * Main result of metadata scraping - optimized for LLM consumption
 */
interface ScrapedData {
  url: string;
  canonicalUrl: string;
  domain: string;
  title: string;
  description: string;
  image?: string;
  favicon?: string;
  content: string;
  textContent: string;
  excerpt: string;
  wordCount: number;
  author?: string;
  publishedAt?: string;
  modifiedAt?: string;
  siteName?: string;
  language?: string;
  contentType: ContentType;
  keywords: string[];
  jsonLd?: Record<string, unknown>[];
  links?: ExtractedLink[];
  summary?: string;
  suggestedTags?: string[];
  entities?: ExtractedEntities;
  extracted?: Record<string, unknown>;
  custom?: Record<string, unknown>;
  scrapedAt: string;
  scrapeTimeMs: number;
  error?: string;
}
/**
 * LLM enhancement types
 */
type EnhancementType = 'summarize' | 'tags' | 'entities' | 'classify';
/**
 * Schema for structured LLM extraction
 */
type ExtractionSchemaType = 'string' | 'number' | 'boolean' | 'string[]' | 'number[]' | `${string}?`;
type ExtractionSchema = Record<string, ExtractionSchemaType>;
/**
 * Forward declaration for LLM provider (defined in llm/types.ts)
 */
interface LLMProvider {
  readonly name: string;
  complete(prompt: string, options?: CompletionOptions): Promise<string>;
  completeJSON<T>(prompt: string, schema: unknown): Promise<T>;
}
interface CompletionOptions {
  maxTokens?: number;
  temperature?: number;
  systemPrompt?: string;
}
/**
 * Forward declaration for Fetcher (defined in fetchers/types.ts)
 */
interface Fetcher {
  readonly name: string;
  fetch(url: string, options: FetchOptions): Promise<FetchResult>;
}
interface FetchOptions {
  timeout?: number;
  userAgent?: string;
  headers?: Record<string, string>;
}
interface FetchResult {
  html: string;
  finalUrl: string;
  statusCode: number;
  contentType: string;
  headers?: Record<string, string>;
}
/**
 * Forward declaration for Extractor (defined in extractors/types.ts)
 */
interface Extractor {
  readonly name: string;
  readonly priority?: number;
  extract(context: ExtractionContext): Promise<Partial<ScrapedData>>;
}
/**
 * Shared context passed to all extractors
 */
interface ExtractionContext {
  url: string;
  finalUrl: string;
  html: string;
  $: CheerioAPI;
  getDocument(): Document;
  results: Partial<ScrapedData>;
  options: ScrapeOptions;
}
/**
 * Options for scraping
 */
interface ScrapeOptions {
  /** Timeout in milliseconds (default: 10000) */
  timeout?: number;
  /** User agent string */
  userAgent?: string;
  /** Whether to extract full content (default: true) */
  extractContent?: boolean;
  /** Maximum content length in characters (default: 50000) */
  maxContentLength?: number;
  /** Custom fetcher (for Puppeteer/Playwright) */
  fetcher?: Fetcher;
  /** Custom extractors to run */
  extractors?: Extractor[];
  /** If true, only run custom extractors (replace defaults) */
  replaceDefaultExtractors?: boolean;
  /** Check robots.txt before scraping (default: false) */
  respectRobots?: boolean;
  /** LLM provider for enhancements */
  llm?: LLMProvider;
  /** LLM enhancement types to run */
  enhance?: EnhancementType[];
  /** Schema for structured LLM extraction */
  extract?: ExtractionSchema;
}
//#endregion
export { ExtractedLink as a, ExtractionSchemaType as c, FetchResult as d, Fetcher as f, ScrapedData as h, ExtractedEntities as i, Extractor as l, ScrapeOptions as m, ContentType as n, ExtractionContext as o, LLMProvider as p, EnhancementType as r, ExtractionSchema as s, CompletionOptions as t, FetchOptions as u };
//# sourceMappingURL=types-CNQZVW36.d.mts.map
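The declarations deleted above describe the plugin surface: an Extractor receives an ExtractionContext (including a CheerioAPI handle) and returns a Partial&lt;ScrapedData&gt;, and is wired in through ScrapeOptions. A minimal sketch follows, assuming these types are re-exported from the package root — the import path and the extractor itself are hypothetical; only the type shapes above are confirmed.

// Sketch only — root import path assumed; type shapes are from the deleted file above.
import type { Extractor, ExtractionContext, ScrapedData, ScrapeOptions } from 'scrapex';

// Hypothetical extractor: copies <meta name="generator"> into `custom`.
const generatorExtractor: Extractor = {
  name: 'generator',
  priority: 10,
  async extract(ctx: ExtractionContext): Promise<Partial<ScrapedData>> {
    const generator = ctx.$('meta[name="generator"]').attr('content');
    return generator ? { custom: { generator } } : {};
  },
};

const options: ScrapeOptions = {
  timeout: 10_000,                  // documented default
  extractContent: true,
  extractors: [generatorExtractor], // runs alongside the defaults
  replaceDefaultExtractors: false,
};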
@@ -1 +0,0 @@
{"version":3,"file":"types-CNQZVW36.d.mts","names":[],"sources":["../src/core/types.ts"],"sourcesContent":[],"mappings":";;;;;;AAKA;AAaiB,KAbL,WAAA,GAakB,SAAA,GAAA,MAAA,GAAA,MAAA,GAAA,SAAA,GAAA,OAAA,GAAA,MAAA,GAAA,SAAA,GAAA,SAAA;AAS9B;AAWA;;AA8BW,UAlDM,aAAA,CAkDN;EAGD,GAAA,EAAA,MAAA;EAKG,IAAA,EAAA,MAAA;EACC,UAAA,EAAA,OAAA;;;AAcd;AAKA;AAQY,UA7EK,iBAAA,CA6E6B;EAK7B,MAAA,EAAA,MAAW,EAAA;EAES,aAAA,EAAA,MAAA,EAAA;EAAoB,YAAA,EAAA,MAAA,EAAA;EACG,SAAA,EAAA,MAAA,EAAA;EAAR,QAAA,EAAA,MAAA,EAAA;;AAGpD;AASA;;AAEqD,UAxFpC,WAAA,CAwFoC;EAAR,GAAA,EAAA,MAAA;EAAO,YAAA,EAAA,MAAA;EAGnC,MAAA,EAAA,MAAA;EAMA,KAAA,EAAA,MAAA;EAWA,WAAA,EAAS,MAAA;EAGP,KAAA,CAAA,EAAA,MAAA;EAAoC,OAAA,CAAA,EAAA,MAAA;EAAR,OAAA,EAAA,MAAA;EAAR,WAAA,EAAA,MAAA;EAAO,OAAA,EAAA,MAAA;EAM7B,SAAA,EAAA,MAAA;EAOZ,MAAA,CAAA,EAAA,MAAA;EAGY,WAAA,CAAA,EAAA,MAAA;EAGE,UAAA,CAAA,EAAA,MAAA;EAAR,QAAA,CAAA,EAAA,MAAA;EAGA,QAAA,CAAA,EAAA,MAAA;EAAa,WAAA,EA3GT,WA2GS;EAMP,QAAA,EAAA,MAAa,EAAA;EAclB,MAAA,CAAA,EA3HD,MA2HC,CAAA,MAAA,EAAA,OAAA,CAAA,EAAA;EAGG,KAAA,CAAA,EA3HL,aA2HK,EAAA;EASP,OAAA,CAAA,EAAA,MAAA;EAGI,aAAA,CAAA,EAAA,MAAA,EAAA;EAGA,QAAA,CAAA,EArIC,iBAqID;EAAgB,SAAA,CAAA,EApId,MAoIc,CAAA,MAAA,EAAA,OAAA,CAAA;WAjIjB;;;;;;;;KAWC,eAAA;;;;KAKA,oBAAA;KAQA,gBAAA,GAAmB,eAAe;;;;UAK7B,WAAA;;qCAEoB,oBAAoB;oDACL,QAAQ;;UAG3C,iBAAA;;;;;;;;UASA,OAAA;;8BAEa,eAAe,QAAQ;;UAGpC,YAAA;;;YAGL;;UAGK,WAAA;;;;;YAKL;;;;;UAMK,SAAA;;;mBAGE,oBAAoB,QAAQ,QAAQ;;;;;UAMtC,iBAAA;;;;KAOZ;iBAGY;WAGN,QAAQ;WAGR;;;;;UAMM,aAAA;;;;;;;;;;YAcL;;eAGG;;;;;;QASP;;YAGI;;YAGA"}
@@ -1,150 +0,0 @@
import { CheerioAPI } from "cheerio";

//#region src/core/types.d.ts

/**
 * Content type classification for scraped URLs
 */
type ContentType = 'article' | 'repo' | 'docs' | 'package' | 'video' | 'tool' | 'product' | 'unknown';
/**
 * Extracted link from content
 */
interface ExtractedLink {
  url: string;
  text: string;
  isExternal: boolean;
}
/**
 * Extracted entities from LLM enhancement
 */
interface ExtractedEntities {
  people: string[];
  organizations: string[];
  technologies: string[];
  locations: string[];
  concepts: string[];
}
/**
 * Main result of metadata scraping - optimized for LLM consumption
 */
interface ScrapedData {
  url: string;
  canonicalUrl: string;
  domain: string;
  title: string;
  description: string;
  image?: string;
  favicon?: string;
  content: string;
  textContent: string;
  excerpt: string;
  wordCount: number;
  author?: string;
  publishedAt?: string;
  modifiedAt?: string;
  siteName?: string;
  language?: string;
  contentType: ContentType;
  keywords: string[];
  jsonLd?: Record<string, unknown>[];
  links?: ExtractedLink[];
  summary?: string;
  suggestedTags?: string[];
  entities?: ExtractedEntities;
  extracted?: Record<string, unknown>;
  custom?: Record<string, unknown>;
  scrapedAt: string;
  scrapeTimeMs: number;
  error?: string;
}
/**
 * LLM enhancement types
 */
type EnhancementType = 'summarize' | 'tags' | 'entities' | 'classify';
/**
 * Schema for structured LLM extraction
 */
type ExtractionSchemaType = 'string' | 'number' | 'boolean' | 'string[]' | 'number[]' | `${string}?`;
type ExtractionSchema = Record<string, ExtractionSchemaType>;
/**
 * Forward declaration for LLM provider (defined in llm/types.ts)
 */
interface LLMProvider {
  readonly name: string;
  complete(prompt: string, options?: CompletionOptions): Promise<string>;
  completeJSON<T>(prompt: string, schema: unknown): Promise<T>;
}
interface CompletionOptions {
  maxTokens?: number;
  temperature?: number;
  systemPrompt?: string;
}
/**
 * Forward declaration for Fetcher (defined in fetchers/types.ts)
 */
interface Fetcher {
  readonly name: string;
  fetch(url: string, options: FetchOptions): Promise<FetchResult>;
}
interface FetchOptions {
  timeout?: number;
  userAgent?: string;
  headers?: Record<string, string>;
}
interface FetchResult {
  html: string;
  finalUrl: string;
  statusCode: number;
  contentType: string;
  headers?: Record<string, string>;
}
/**
 * Forward declaration for Extractor (defined in extractors/types.ts)
 */
interface Extractor {
  readonly name: string;
  readonly priority?: number;
  extract(context: ExtractionContext): Promise<Partial<ScrapedData>>;
}
/**
 * Shared context passed to all extractors
 */
interface ExtractionContext {
  url: string;
  finalUrl: string;
  html: string;
  $: CheerioAPI;
  getDocument(): Document;
  results: Partial<ScrapedData>;
  options: ScrapeOptions;
}
/**
 * Options for scraping
 */
interface ScrapeOptions {
  /** Timeout in milliseconds (default: 10000) */
  timeout?: number;
  /** User agent string */
  userAgent?: string;
  /** Whether to extract full content (default: true) */
  extractContent?: boolean;
  /** Maximum content length in characters (default: 50000) */
  maxContentLength?: number;
  /** Custom fetcher (for Puppeteer/Playwright) */
  fetcher?: Fetcher;
  /** Custom extractors to run */
  extractors?: Extractor[];
  /** If true, only run custom extractors (replace defaults) */
  replaceDefaultExtractors?: boolean;
  /** Check robots.txt before scraping (default: false) */
  respectRobots?: boolean;
  /** LLM provider for enhancements */
  llm?: LLMProvider;
  /** LLM enhancement types to run */
  enhance?: EnhancementType[];
  /** Schema for structured LLM extraction */
  extract?: ExtractionSchema;
}
//#endregion
export { ExtractedLink as a, ExtractionSchemaType as c, FetchResult as d, Fetcher as f, ScrapedData as h, ExtractedEntities as i, Extractor as l, ScrapeOptions as m, ContentType as n, ExtractionContext as o, LLMProvider as p, EnhancementType as r, ExtractionSchema as s, CompletionOptions as t, FetchOptions as u };
//# sourceMappingURL=types-D0HYR95H.d.cts.map
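The LLMProvider contract deleted here (and in its .d.mts twin above) is small: one free-form completion method and one JSON-typed one. A stub satisfying it, useful in tests, might look like the sketch below — the import path is assumed and the bodies are stand-ins, not real model calls.

// Stub satisfying the removed LLMProvider interface; import path assumed.
import type { LLMProvider, CompletionOptions } from 'scrapex';

const stubProvider: LLMProvider = {
  name: 'stub',
  async complete(prompt: string, options?: CompletionOptions): Promise<string> {
    const limit = options?.maxTokens ?? 256; // crude stand-in for a token budget
    return prompt.slice(0, limit);
  },
  async completeJSON<T>(_prompt: string, _schema: unknown): Promise<T> {
    return {} as T; // stand-in: a real provider returns schema-shaped JSON
  },
};

// Wired in via the removed ScrapeOptions fields:
//   { llm: stubProvider, enhance: ['summarize', 'tags'] }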
@@ -1 +0,0 @@
{"version":3,"file":"types-D0HYR95H.d.cts","names":[],"sources":["../src/core/types.ts"],"sourcesContent":[],"mappings":";;;;;;AAKA;AAaiB,KAbL,WAAA,GAakB,SAAA,GAAA,MAAA,GAAA,MAAA,GAAA,SAAA,GAAA,OAAA,GAAA,MAAA,GAAA,SAAA,GAAA,SAAA;AAS9B;AAWA;;AA8BW,UAlDM,aAAA,CAkDN;EAGD,GAAA,EAAA,MAAA;EAKG,IAAA,EAAA,MAAA;EACC,UAAA,EAAA,OAAA;;;AAcd;AAKA;AAQY,UA7EK,iBAAA,CA6E6B;EAK7B,MAAA,EAAA,MAAW,EAAA;EAES,aAAA,EAAA,MAAA,EAAA;EAAoB,YAAA,EAAA,MAAA,EAAA;EACG,SAAA,EAAA,MAAA,EAAA;EAAR,QAAA,EAAA,MAAA,EAAA;;AAGpD;AASA;;AAEqD,UAxFpC,WAAA,CAwFoC;EAAR,GAAA,EAAA,MAAA;EAAO,YAAA,EAAA,MAAA;EAGnC,MAAA,EAAA,MAAA;EAMA,KAAA,EAAA,MAAA;EAWA,WAAA,EAAS,MAAA;EAGP,KAAA,CAAA,EAAA,MAAA;EAAoC,OAAA,CAAA,EAAA,MAAA;EAAR,OAAA,EAAA,MAAA;EAAR,WAAA,EAAA,MAAA;EAAO,OAAA,EAAA,MAAA;EAM7B,SAAA,EAAA,MAAA;EAOZ,MAAA,CAAA,EAAA,MAAA;EAGY,WAAA,CAAA,EAAA,MAAA;EAGE,UAAA,CAAA,EAAA,MAAA;EAAR,QAAA,CAAA,EAAA,MAAA;EAGA,QAAA,CAAA,EAAA,MAAA;EAAa,WAAA,EA3GT,WA2GS;EAMP,QAAA,EAAA,MAAa,EAAA;EAclB,MAAA,CAAA,EA3HD,MA2HC,CAAA,MAAA,EAAA,OAAA,CAAA,EAAA;EAGG,KAAA,CAAA,EA3HL,aA2HK,EAAA;EASP,OAAA,CAAA,EAAA,MAAA;EAGI,aAAA,CAAA,EAAA,MAAA,EAAA;EAGA,QAAA,CAAA,EArIC,iBAqID;EAAgB,SAAA,CAAA,EApId,MAoIc,CAAA,MAAA,EAAA,OAAA,CAAA;WAjIjB;;;;;;;;KAWC,eAAA;;;;KAKA,oBAAA;KAQA,gBAAA,GAAmB,eAAe;;;;UAK7B,WAAA;;qCAEoB,oBAAoB;oDACL,QAAQ;;UAG3C,iBAAA;;;;;;;;UASA,OAAA;;8BAEa,eAAe,QAAQ;;UAGpC,YAAA;;;YAGL;;UAGK,WAAA;;;;;YAKL;;;;;UAMK,SAAA;;;mBAGE,oBAAoB,QAAQ,QAAQ;;;;;UAMtC,iBAAA;;;;KAOZ;iBAGY;WAGN,QAAQ;WAGR;;;;;UAMM,aAAA;;;;;;;;;;YAcL;;eAGG;;;;;;QASP;;YAGI;;YAGA"}