varak-chunker 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +31 -0
- package/dist/index.js +1 -0
- package/index.d.ts +155 -0
- package/package.json +16 -0
package/README.md
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# varak-chunker
|
|
2
|
+
|
|
3
|
+
Thai legal document chunking & OCR pipeline for RAG systems.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install github:YOUR_ORG/varak-chunker-dist
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Peer dependency
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
npm install pdfjs-dist
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Usage
|
|
18
|
+
|
|
19
|
+
```js
|
|
20
|
+
import {
|
|
21
|
+
extractPdfText,
|
|
22
|
+
chunkBySections,
|
|
23
|
+
segmentVarakByRules,
|
|
24
|
+
classifyPdf,
|
|
25
|
+
} from 'varak-chunker';
|
|
26
|
+
|
|
27
|
+
const text = await extractPdfText(pdfBuffer);
|
|
28
|
+
const chunks = await chunkBySections(text);
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
See `index.d.ts` for full API.
|
package/dist/index.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
const _0x4f3251=_0x2a5f;function _0x2a5f(_0x4b548a,_0xe34067){_0x4b548a=_0x4b548a-0x147;const _0x5afa45=_0x5afa();let _0x2a5fa7=_0x5afa45[_0x4b548a];if(_0x2a5f['rHojjU']===undefined){var _0x29faa9=function(_0x519220){const _0x276682='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+/=';let _0x2e5725='',_0x35f9ab='';for(let _0x4e2768=0x0,_0x516192,_0x9b1d42,_0x1f9a7f=0x0;_0x9b1d42=_0x519220['charAt'](_0x1f9a7f++);~_0x9b1d42&&(_0x516192=_0x4e2768%0x4?_0x516192*0x40+_0x9b1d42:_0x9b1d42,_0x4e2768++%0x4)?_0x2e5725+=String['fromCharCode'](0xff&_0x516192>>(-0x2*_0x4e2768&0x6)):0x0){_0x9b1d42=_0x276682['indexOf'](_0x9b1d42);}for(let _0x315622=0x0,_0x42ce8d=_0x2e5725['length'];_0x315622<_0x42ce8d;_0x315622++){_0x35f9ab+='%'+('00'+_0x2e5725['charCodeAt'](_0x315622)['toString'](0x10))['slice'](-0x2);}return decodeURIComponent(_0x35f9ab);};const _0x2043cd=function(_0x5af4ae,_0x4cacf2){let _0x5e44ba=[],_0x546b1b=0x0,_0x3d4593,_0x38d482='';_0x5af4ae=_0x29faa9(_0x5af4ae);let _0x47c8c;for(_0x47c8c=0x0;_0x47c8c<0x100;_0x47c8c++){_0x5e44ba[_0x47c8c]=_0x47c8c;}for(_0x47c8c=0x0;_0x47c8c<0x100;_0x47c8c++){_0x546b1b=(_0x546b1b+_0x5e44ba[_0x47c8c]+_0x4cacf2['charCodeAt'](_0x47c8c%_0x4cacf2['length']))%0x100,_0x3d4593=_0x5e44ba[_0x47c8c],_0x5e44ba[_0x47c8c]=_0x5e44ba[_0x546b1b],_0x5e44ba[_0x546b1b]=_0x3d4593;}_0x47c8c=0x0,_0x546b1b=0x0;for(let _0x13f290=0x0;_0x13f290<_0x5af4ae['length'];_0x13f290++){_0x47c8c=(_0x47c8c+0x1)%0x100,_0x546b1b=(_0x546b1b+_0x5e44ba[_0x47c8c])%0x100,_0x3d4593=_0x5e44ba[_0x47c8c],_0x5e44ba[_0x47c8c]=_0x5e44ba[_0x546b1b],_0x5e44ba[_0x546b1b]=_0x3d4593,_0x38d482+=String['fromCharCode'](_0x5af4ae['charCodeAt'](_0x13f290)^_0x5e44ba[(_0x5e44ba[_0x47c8c]+_0x5e44ba[_0x546b1b])%0x100]);}return _0x38d482;};_0x2a5f['XpAYVW']=_0x2043cd,_0x2a5f['euqDrP']={},_0x2a5f['rHojjU']=!![];}const 
_0x455da2=_0x5afa45[0x0],_0x248e52=_0x4b548a+_0x455da2,_0x5a2da3=_0x2a5f['euqDrP'][_0x248e52];return!_0x5a2da3?(_0x2a5f['BdyFmb']===undefined&&(_0x2a5f['BdyFmb']=!![]),_0x2a5fa7=_0x2a5f['XpAYVW'](_0x2a5fa7,_0xe34067),_0x2a5f['euqDrP'][_0x248e52]=_0x2a5fa7):_0x2a5fa7=_0x5a2da3,_0x2a5fa7;}(function(_0x526648,_0x2e1245){const _0x16cc4b=_0x2a5f,_0x51e88d=_0x526648();while(!![]){try{const _0x15ef50=parseInt(_0x16cc4b(0x1c1,'q2mg'))/0x1*(-parseInt(_0x16cc4b(0x17f,'HzW3'))/0x2)+parseInt(_0x16cc4b(0x220,'rza)'))/0x3*(-parseInt(_0x16cc4b(0x1b1,'Rx(M'))/0x4)+-parseInt(_0x16cc4b(0x1bb,'E^Gu'))/0x5*(-parseInt(_0x16cc4b(0x20a,'BwKV'))/0x6)+-parseInt(_0x16cc4b(0x20e,'ZWcJ'))/0x7*(-parseInt(_0x16cc4b(0x186,'rza)'))/0x8)+parseInt(_0x16cc4b(0x1d3,'rza)'))/0x9+parseInt(_0x16cc4b(0x1cc,'QaQ#'))/0xa+-parseInt(_0x16cc4b(0x1de,'sS1W'))/0xb;if(_0x15ef50===_0x2e1245)break;else _0x51e88d['push'](_0x51e88d['shift']());}catch(_0x254b2e){_0x51e88d['push'](_0x51e88d['shift']());}}}(_0x5afa,0x4f969));var THAI_DIGITS=_0x4f3251(0x259,'oEXD');function normalizeDigits(_0x569291){const _0x4a920f=_0x4f3251;return _0x569291[_0x4a920f(0x221,'8It*')](/[๐-๙]/g,_0x48e34d=>THAI_DIGITS[_0x4a920f(0x218,'&TGz')](_0x48e34d));}function extractChapter(_0x25615f){const _0x5a463b=_0x4f3251,_0x399a72=_0x25615f['match'](/^หมวด\s+[\d๐-๙]+/);if(!_0x399a72)return null;return normalizeDigits(_0x399a72[0x0])[_0x5a463b(0x1b7,'Hc5G')](/\s+/g,'\x20')[_0x5a463b(0x210,'ENsj')]();}function extractChapterTitle(_0x4c764d){const _0x392f45=_0x4f3251,_0x4cb4e9=_0x4c764d[_0x392f45(0x14e,'z[&C')](/^หมวด[ \t]+[\d๐-๙]+[ \t]+(.+)/);if(_0x4cb4e9)return _0x4cb4e9[0x1][_0x392f45(0x21d,'0Jbg')]();const _0x4dfc33=_0x4c764d[_0x392f45(0x21b,'BwKV')]('\x0a')[_0x392f45(0x267,'QaQ#')](_0x43a66f=>_0x43a66f[_0x392f45(0x201,'uC@)')]())[_0x392f45(0x217,'fLE0')](_0x4e189e=>_0x4e189e['length']>0x0),_0x1c9eba=_0x4dfc33[0x1];if(_0x1c9eba&&!/^(?:มาตรา|ข้อ)\s+[\d๐-๙]+/[_0x392f45(0x23d,'fLE0')](_0x1c9eba))return _0x1c9eba;return null;}function 
_0x5afa(){const _0x56496d=['nK7cRW','W4bNvCoFc8kVWRJdUW','s8kZaa','W5pcRCk6mGRdLg0','WQtdKJJdPmk/cmkfF8keWPnEW49PW4FdUG','WPddVmo7ALVcH0WwnvmcW6y','yXxcNa','z0/dJcxcLmkFw8oe','W6mxav/cOCklW6e4W69LWPiQ','eclcKSoFWOpcPG','WPJdJxudFW','WO9Gwa','BmkgWPtcQWJcQM0U','d8kGWR0auq','WQumWPhdHCojkmk6','bmoIyt13','agpdJW','WQGYW49qA8kKW5q','kCoDW7BdQ03dTMSrW7SFCCkk','wCkeWP3cJJW','WRHlWPVdKwzh','zd/cKx/cULO8WRRcLCkYWQhcSSknWOy','W4NcJmkSWR4/FmkhWP96','pqLPWPZcRW','WQlcM395Ec97W74km8o+qG','bwFdJ8oaWOVcTxK','mmkQW5ldKa','W5tcQrFcOc0','W4ZdUCo7','WO9MwW','ggpdJSohWPK','WPGCWQZdRCop','qdJcIutcVG','W5ykzCoUWQq','WPeiWRFcKIq','W4nrW6RdKeJdTmkVoXPgW6xdOgq','wSkXlw0Nq8o9W6hdP0hcTqddQq','WR4wzCobcq','W5xcUSkJmW','WQCrBW','W5GtrCoR','g3ldKmoAWP4','W7hcJrXtmM94dL5PWPTzW6e','WQugWPxdImoClG','WOCVyCoana','mmk0iwrP','FvxdNq','WRKCW4y','yXlcNq','WQ4dW4DVAa','W43cMKOUD2m','WQjwqWm','WRiYWRZcHYq','WP1ugSk+W5jWW6vestCCzW','sM7dOGtcGmkptmoEWRy','W4/cLSk6','WQhcPGO','WQiZWPG','ichdTK3cVaZcTq','hCojWPNdRNBcPG','WRBdJuuoEtGLg2u','pwLCtNVcUSk6','aKGvW6W8jCoeW5a','W4FcLmo9W57cV8ktAq','heGlW5q','dZpcJSorWPNcQq','bvaq','WRTnva7dQCo0W4m','WR3dTNldKSoS','expdJq','WQBdSt8AWR0','awhdImo7WPm','WPLRoq','BmkBWQ7cSG','WRtdGJKJWOOw','sfndjSoK','aq9j','WRfyWRldTuu','WQ/dNLasyYzI','WPhdGZm2','n8kKW4/dNCk4','W4ewsCodWO0','WQddIKigBYW','fSojWOpdJw3cRvaSrCkBkG','gmo+y8o3W7y','WQrBWOBdNG','DctcVW','WOz8yG','W6xcOqJcHG','WOaZWOVcQqu','WPhdQGNdIq','WQrIAq','W4VcU1mEva','B2tcU38y','WPldRXBdImkp','o21c','W73cOa/cJmkXtmo8WQ58jt/dNa','cY3cGgmMvfPw','4lQ14lU34lIz4lIr4lQW4lI34lQJ4lMH4lQo4lUY','WRKFW4q','W4ddQWOwcvvkW40CkSoD','WP3dKSo3W6FcNCkdvYK','gCoXEJn2aCkQ','WP8/oCkj','WQRdRbieWRa','lSk3W5RdKmkJxXddMra','lmoovXTBlCkBW4/dVeRcKZddUG','rConWPtcQaNcQIG','oHtcM3VdLCoidSoJWO/dJ2qnba','jI3dQ0tcSHRcPa','W6O5DSohWRGj','W4JcPSkUoXpdUg4','WPhdLciV','W4KtvCo9WPm6W6aovmkyfCki','tgxcThWz','uSoqWO/cTW','ogD/dq','W4ZcV8oW','btdcJG','W7BcIMmWsd8bgG','gCkuWPWREmoVW6G','W4aeqSoHWP8R','WQLWW4xdJH/dKColDJz5WPhdSSkSDZKe','W44YW6a','W4dcHYVcQHa
','vSodWPtcRG8','W7BdGXiCpYj7ft9BW4jaWPrLcW','WOSNi8kaWRldJc7cI1xcRa','pYJcQmoVWQa','WQbpW4BdRIa','WQa2w0u9','vCowWOFcQaJcQqVdKCkbbW','zMvh','WOiJm8kiWR8DWO3dLa','valcVvZcQeGHWQhcGa','W7ZdIIiUpNi','iJGy','y0ddNW','WRLkmW','WQmmWPldNq','nK7cQa','E8olkqnd','WRrkxb/dTmoj','WRuACG','W73cHcpcNqy','W4aax8oYWRWQW6b5tW','WOaTzmoe','WQSCjCkkWRK','W7G1ACoh','a3auW7eK','W78ZE8oFWRG','W4pdLCkn','vadcPr8a','WONdGLZdNCoE','hmkDW4u','Bxzye8oWiY5DpaO','WPepW49zBa','W6eLDSohWRqlW444','WOmwWR/cJa/cOCoWhty','g2/dL8ogWRK','ywZdGWtcMa','FxNdGG','4lMi4lQk4lMb4lMk4lIu4lMDyWq','DhFdVa','WR0InfO','eSkEofXB','bmkYiq','WOddNSkq','WPxdUGNdLq','EcFcVa','WPLSpG','W6lcHmonW53cLq','yXlcMW','vSo0qmkd','s2FcUxSECevtW7e','kwvEuw7cGq','4lM24lUq4lMH4lQ44lM84lQ+4lMG4lI/4lUr4lU+','thBcUxKi','CYRcPJi','ACoIASkcFW','EgTfcG','DmkmWQNcUa7cQW','WPG6y8oonmkf','WOJdKYC2WO0sWPLp','vSo5xmksyW','WQqnCKq','WQJdOYebWOu','WR1aWPBdMMDlEZa','WQddScKNWRW','WQ85mupdMSk3','WPOfWQ4','W5pdQ8kZWPpdU2PJ','WPBdHsOUWOeSWPPyWOpdHq7dQ8o/W7VdQa','W4fqga','WQzruqxdOSoDW4O+W4S','s8kZaq','ltjdWRVcSa','kSkQo2HVFgq','WQLRW4BdLZZdJW','4lUG4lID4lMhW4tdS8kt','DhBdVG','khddUwrykMyCW4pcNxPS','zCognano','hmkDW4q','WQ0Em8kIWOm','y8oVWQtcNZJcHqJdSCk4kSk1cmk8','WQOIW4Xu','WR5cqX/dIq','WPvMsrVdLq','W43dQ8kTWPJdRMe','W77cGaZcOre','stlcPWqB','W4ZcIKG9ANVcMsK','DhtdUW','l8k+WQexDSoLW6pdGa','sSo2gIvn','EmosmWG','WOCrWQ3cIG','WRBdNKO','Fx7dGG','W5nKuSoCW67cLbZcOgRcR04j','WOVcMmooW6tdPCo1lvqzESozWOpcN8kAxCky','uIhdQW','WO81gG','WPSbWRdcHqJcRW','WOmwWRFcJW','WR93W5NdIW','qmolWORcRHNcQa','l3PYbq','WPLLWOBdMhS','WQujWRtcIYO','WQDlWPBdGNTaCa','WPG5mCkqWR8kW4JdHgZdUSolcCkP','u3dcTNizFG','eGtdSMFcVq','bwbAkSkN','hLGiW5a8iCoe','bafpWRNcNmoI','oCkQW4G','fmklvmotBqpcH8oVW57dJCkgWR3cILq','WRHmwqu','vCosWORcSWG','W5Gex8oY','W7ZcNdBcVaO','W5Krx8oJWPK3','iYhdV1ZcUW','bCoMEZLRgG','WQDfWPddGuS','jd3dQea','WQ0Egg3dHG','W4ZcV8oY','W6G0W6zrBSkDW6a','yKtdGs7cLCkt','h00uW5uP','W4ZdUSo7','W5hdPYmEdq','WPSzo1VdKG','WQbCWPZdMW','W7pcJXXym2Gmk0nDWP5R','s8kYaq','bvat','WO/dUK
4JAW','p2fK','WQtdOsmMWR0','W5RcLCovW4e','i2Lcrw7cSq','WO4XbmkdWQpdIG','bCoIzIPig8kRW7ldMq','W4ZdR8kZ','W4tcImovW4i','dCkdWOmQsCoJW6tdMG0','WOxdUCoUBa','WOz/yq','omovDmog','WRGDB0SKFG','m8kRW5JdKSkLxrRdMa','WPKOimkiWQSAWO0','s2FcSxG','i8kQzG','WOfnW7xdTXZdPCoW','W4pcKSoqW5JcPmkcFdi','W6ddMJ8H','WRKFW4C','vSoxWPxcSG','W6xcTHlcNW','WOz+yq','W7veWQOVW58','mqxdRLbu','WPOSWOVdQSoj','WQjavGBdMq','W4ZcL8k7WRm','W49KWR0yW7VcIG','WOuICmkbWRinWPRdHMddQSops8oXWQvRAsLZyq'];_0x5afa=function(){return _0x56496d;};return _0x5afa();}var CHUNK_SIZE=0x1f4,CHUNK_OVERLAP=0x32;function normalizeText(_0x2d9162){const _0x455274=_0x4f3251,_0x2c4067={'EzvKe':_0x455274(0x14b,'&gdT'),'rzvrU':_0x455274(0x193,'&TGz'),'tNZim':'มาตรา\x20$1'},_0x5f05d8='8|0|12|2|1|6|11|3|5|9|7|4|10'[_0x455274(0x1d2,'(grQ')]('|');let _0x3467d0=0x0;while(!![]){switch(_0x5f05d8[_0x3467d0++]){case'0':_0x2d9162=normalizeDigits(_0x2d9162);continue;case'1':_0x2d9162=_0x2d9162[_0x455274(0x198,'ENsj')](/\n(หน้า\s+\d+[^\n]*\nเล่ม\s+\d+[^\n]*ราชกิจจานุเบกษา[^\n]*\n(?:\s*\n)*)/g,'\x0a');continue;case'2':_0x2d9162=_0x2d9162[_0x455274(0x268,'P1ZY')](/\n\n(หน้า\s+\d+[^\n]*\nเล่ม\s+\d+[^\n]*ราชกิจจานุเบกษา[^\n]*\n)(?!\n)/g,'\x0a');continue;case'3':_0x2d9162=_0x2d9162['replace'](/\n+สํานักงานคณะกรรมการกฤษฎีกา\n+/g,'\x0a\x0a');continue;case'4':_0x2d9162=_0x2d9162[_0x455274(0x1ac,'&TGz')](/ข้อ\s*([\d]+)/g,_0x2c4067[_0x455274(0x1c4,'uC@)')]);continue;case'5':_0x2d9162=_0x2d9162[_0x455274(0x268,'P1ZY')](/-\s*\d+\s*-/g,'');continue;case'6':_0x2d9162=_0x2d9162['replace'](/([ก-ฮ])\n([\u0E30-\u0E4E])/g,_0x2c4067['rzvrU']);continue;case'7':_0x2d9162=_0x2d9162['replace'](/มาตรา\s*([\d]+)/g,_0x2c4067[_0x455274(0x22b,'rTdk')]);continue;case'8':_0x2d9162=_0x2d9162['replace'](/[\uD800-\uDFFF\u200B\u00AD]/g,'');continue;case'9':_0x2d9162=_0x2d9162['replace'](/\n{3,}/g,'\x0a\x0a');continue;case'10':return 
_0x2d9162[_0x455274(0x18c,'ZWcJ')]();case'11':_0x2d9162=_0x2d9162['replace'](/\n(สํานักงานคณะกรรมการกฤษฎีกา\n+-\s*\d+\s*-\n+สํานักงานคณะกรรมการกฤษฎีกา)\n+/g,'\x0a');continue;case'12':_0x2d9162=_0x2d9162[_0x455274(0x216,'*4V5')](/\n\n(หน้า\s+\d+[^\n]*\nเล่ม\s+\d+[^\n]*ราชกิจจานุเบกษา[^\n]*\n(?:\s*\n)+)/g,'\x0a\x0a');continue;}break;}}function chunkByTokens(_0x28bb17,_0x5106b0=CHUNK_SIZE*0x4){const _0x3a6962=_0x4f3251,_0x247700={'mOWWB':function(_0x24646e,_0x2cfdd3){return _0x24646e-_0x2cfdd3;},'MKsni':function(_0x28330f,_0x1fe0fd){return _0x28330f*_0x1fe0fd;},'ictHy':function(_0x360d74,_0x63a202){return _0x360d74+_0x63a202;}},_0x2ea48c=[];let _0x326464=0x0;const _0xf97b9c=_0x247700[_0x3a6962(0x177,'uC@)')](_0x5106b0,_0x247700[_0x3a6962(0x169,'KUnX')](CHUNK_OVERLAP,0x4));while(_0x326464<_0x28bb17[_0x3a6962(0x25f,'6mAj')]){_0x2ea48c[_0x3a6962(0x1a6,'S[06')](_0x28bb17[_0x3a6962(0x236,'z[&C')](_0x326464,_0x247700[_0x3a6962(0x1f0,'(grQ')](_0x326464,_0x5106b0))[_0x3a6962(0x1d1,'sS1W')]()),_0x326464+=_0xf97b9c;}return _0x2ea48c['filter'](_0x4cce94=>_0x4cce94['length']>0x0);}function chunkByParagraphs(_0x19f503,_0x56fa2a=0x5dc){const _0x4612ff=_0x4f3251,_0x5b203e={'MFSpP':function(_0x47d25a,_0xbfca45,_0x42e9c0){return _0x47d25a(_0xbfca45,_0x42e9c0);}},_0x2bc8a1=_0x19f503['split'](/\n\n+/)[_0x4612ff(0x232,'x!jj')](_0x2be052=>_0x2be052['trim']())['filter'](_0x3da756=>_0x3da756['length']>=0x14);if(_0x2bc8a1[_0x4612ff(0x196,'rTdk')]<=0x1)return _0x5b203e[_0x4612ff(0x1fa,'fLE0')](chunkByTokens,_0x19f503,_0x56fa2a);const _0xc232dc=[];let _0x596a22='';for(const _0xb75cd9 of _0x2bc8a1){if(_0xb75cd9['length']>_0x56fa2a){_0x596a22[_0x4612ff(0x24d,'B2Lf')]()&&(_0xc232dc[_0x4612ff(0x15d,'QaQ#')](_0x596a22[_0x4612ff(0x219,'AwG@')]()),_0x596a22='');_0xc232dc['push'](...chunkByTokens(_0xb75cd9,_0x56fa2a));continue;}const 
_0x48a89f=_0x596a22?_0x596a22+'\x0a\x0a'+_0xb75cd9:_0xb75cd9;_0x48a89f[_0x4612ff(0x155,'P1ZY')]>_0x56fa2a&&_0x596a22?(_0xc232dc[_0x4612ff(0x166,'Z(oZ')](_0x596a22[_0x4612ff(0x185,'KUnX')]()),_0x596a22=_0xb75cd9):_0x596a22=_0x48a89f;}if(_0x596a22[_0x4612ff(0x23b,'6mAj')]())_0xc232dc[_0x4612ff(0x15d,'QaQ#')](_0x596a22[_0x4612ff(0x24d,'B2Lf')]());return _0xc232dc[_0x4612ff(0x266,'B2Lf')](_0x28cf98=>_0x28cf98[_0x4612ff(0x1b2,'PHdi')]>0x0);}function extractSection(_0x54a81f){const _0x53e65c=_0x4f3251,_0x35b610=_0x54a81f[_0x53e65c(0x1f9,'Tf43')](/^((?:มาตรา|ข้อ)\s+[\d๐-๙]+(?:[/-][\d๐-๙]+)?)/);if(!_0x35b610)return null;return _0x35b610[0x1][_0x53e65c(0x1e6,'oqoT')](/[๐-๙]/g,_0x2ff36d=>_0x53e65c(0x20b,'AwG@')[_0x53e65c(0x1ec,'Rx(M')](_0x2ff36d));}var CHAPTER_RE=/^หมวด\s+[\d๐-๙]+/;async function chunkBySections(_0x4e0d9c){const _0x1559ac=_0x4f3251,_0x1b1fff={'Ugejk':function(_0x32ec88,_0x4b6c51){return _0x32ec88(_0x4b6c51);},'pcfmH':function(_0x357eff,_0x4a04a6){return _0x357eff<_0x4a04a6;},'qbLjj':function(_0x3d9e9b,_0x33846c){return _0x3d9e9b<=_0x33846c;},'evGCW':function(_0x425bc2,_0x31de64){return _0x425bc2*_0x31de64;},'OJYFT':function(_0x5e9f96,_0x3dabee){return _0x5e9f96(_0x3dabee);},'wWUKy':function(_0x139aa7,_0x525182){return _0x139aa7+_0x525182;}},_0x2df4a4=normalizeText(_0x4e0d9c),_0xf7ba86=_0x2df4a4[_0x1559ac(0x181,'AnpV')](/(?<!\S)(?=(?:หมวด\s+[\d๐-๙]+|(?:มาตรา|ข้อ)\s+[\d๐-๙]+))/),_0x4c913b=_0xf7ba86[_0x1559ac(0x1ff,'u7VD')](_0x258f41=>_0x258f41[_0x1559ac(0x199,'BwKV')]())['filter'](_0x5c2435=>_0x5c2435[_0x1559ac(0x222,'sS1W')]>0x0),_0x30f8bd=_0x4c913b[_0x1559ac(0x195,'2Akt')](_0x22c717=>/^(?:มาตรา|ข้อ)\s+[\d๐-๙]+/['test'](_0x22c717));if(!_0x30f8bd)return null;const _0x285f70=[];let _0x2950f8=null,_0x211368=null;for(const _0x46ac46 of _0x4c913b){if(CHAPTER_RE[_0x1559ac(0x234,'Hc5G')](_0x46ac46)){_0x2950f8=extractChapter(_0x46ac46),_0x211368=_0x1b1fff['Ugejk'](extractChapterTitle,_0x46ac46);continue;}const 
_0xf0f7a9=extractSection(_0x46ac46),_0x109e69=_0x46ac46['split'](/\n\n+/)[_0x1559ac(0x163,'&gdT')](_0x5340ec=>_0x5340ec[_0x1559ac(0x1cf,'&TGz')]())[_0x1559ac(0x237,'Rx(M')](_0x3064d0=>_0x3064d0['length']>=0xa);if(_0x109e69[_0x1559ac(0x196,'rTdk')]>0x1){for(let _0x47a5ca=0x0;_0x1b1fff[_0x1559ac(0x1a5,'Rx(M')](_0x47a5ca,_0x109e69[_0x1559ac(0x1db,'%0f3')]);_0x47a5ca++){const _0x572f88=_0x109e69[_0x47a5ca];if(_0x1b1fff[_0x1559ac(0x1ed,'jufK')](_0x572f88[_0x1559ac(0x25f,'6mAj')],_0x1b1fff[_0x1559ac(0x1f6,'KUnX')](CHUNK_SIZE,0x4)))_0x285f70['push']({'text':_0x572f88,'section':_0xf0f7a9,'chapter':_0x2950f8,'chapterTitle':_0x211368,'varakIndex':_0x47a5ca+0x1});else for(const _0x3a5177 of _0x1b1fff[_0x1559ac(0x150,'&gdT')](chunkByTokens,_0x572f88)){_0x285f70[_0x1559ac(0x262,'rTdk')]({'text':_0x3a5177,'section':_0xf0f7a9,'chapter':_0x2950f8,'chapterTitle':_0x211368,'varakIndex':_0x1b1fff[_0x1559ac(0x202,'QaQ#')](_0x47a5ca,0x1)});}}continue;}if(_0x46ac46[_0x1559ac(0x1e4,'i7Kk')]<=CHUNK_SIZE*0x4)_0x285f70[_0x1559ac(0x1a6,'S[06')]({'text':_0x46ac46,'section':_0xf0f7a9,'chapter':_0x2950f8,'chapterTitle':_0x211368,'varakIndex':0x1});else for(const _0x7ae04f of _0x1b1fff[_0x1559ac(0x211,'AwG@')](chunkByTokens,_0x46ac46)){_0x285f70[_0x1559ac(0x256,'oEXD')]({'text':_0x7ae04f,'section':_0xf0f7a9,'chapter':_0x2950f8,'chapterTitle':_0x211368,'varakIndex':0x1});}}return _0x285f70;}function buildEmbedText(_0x41b837,_0x5456e2){const _0x33ab05=_0x4f3251,_0x195827=[_0x33ab05(0x24b,'z[&C')+_0x5456e2];if(_0x41b837[_0x33ab05(0x16b,'KUnX')])_0x195827[_0x33ab05(0x1e9,'AnpV')]('มาตรา:\x20'+_0x41b837['section']);return _0x195827[_0x33ab05(0x17c,'*4V5')](_0x41b837[_0x33ab05(0x1f8,'AwG@')]),_0x195827[_0x33ab05(0x1c3,'Tf43')]('\x0a');}var THAI_CHAR_RE=/[\u0E00-\u0E7F]/g,NON_WHITESPACE_RE=/\S/g;function thaiRatio(_0x21f2c5){const _0xa61599=_0x4f3251;if(!_0x21f2c5)return 0x0;const 
_0xc85e49=(_0x21f2c5[_0xa61599(0x14e,'z[&C')](THAI_CHAR_RE)??[])[_0xa61599(0x1a7,']6ux')],_0x3a4599=(_0x21f2c5[_0xa61599(0x1b8,'c8pb')](NON_WHITESPACE_RE)??[])[_0xa61599(0x230,'q2mg')];if(_0x3a4599===0x0)return 0x0;return _0xc85e49/_0x3a4599;}var THAI_RATIO={'EMBEDDED':0.3,'OCR':0.2},MIN_THAI_RATIO=THAI_RATIO['EMBEDDED'];function classifyPdf(_0x1d8a03){const _0x79aaf3=_0x4f3251,_0x57fd1a={'Pqcju':function(_0x790359,_0x392899){return _0x790359<_0x392899;}};if(!_0x1d8a03||_0x57fd1a[_0x79aaf3(0x206,'BwKV')](_0x1d8a03[_0x79aaf3(0x165,'QaQ#')]()[_0x79aaf3(0x171,'2TH)')],0x32))return 0x1;if(/หมวด\s*[\d๐-๙]+/[_0x79aaf3(0x176,'sS1W')](_0x1d8a03)&&/(?:มาตรา|ข้อ)\s*[\d๐-๙]+/[_0x79aaf3(0x251,'Xts1')](_0x1d8a03))return 0x2;if(/(?:มาตรา|ข้อ)\s*[\d๐-๙]+/[_0x79aaf3(0x1a0,'uC@)')](_0x1d8a03))return 0x3;return 0x4;}var RETRIABLE_CODES=new Set([_0x4f3251(0x151,'B^T#'),'QDRANT_ERROR']);function classifyError(_0x31138a){const _0xf075b9=_0x4f3251,_0x5f54ee={'jDwWW':_0xf075b9(0x1a8,'ENsj'),'fmWBz':_0xf075b9(0x21a,'fLE0'),'ZrQcu':_0xf075b9(0x16c,'ENsj'),'QTJAX':_0xf075b9(0x1f3,'AwG@'),'FLiOn':_0xf075b9(0x214,'B^T#')},_0x311ecb=_0x31138a?.[_0xf075b9(0x1c2,'(grQ')]??'';if(_0x311ecb['includes'](_0x5f54ee[_0xf075b9(0x205,'%0f3')])||_0x311ecb[_0xf075b9(0x197,'Tf43')]('EMPTY_CHUNKS'))return _0xf075b9(0x19b,'Z(oZ');if(_0x311ecb['includes'](_0xf075b9(0x1e3,'*4V5'))||_0x31138a?.[_0xf075b9(0x168,'0Jbg')]===_0x5f54ee['fmWBz'])return _0xf075b9(0x213,'c8pb');if(_0x311ecb[_0xf075b9(0x22e,'ENsj')](_0x5f54ee[_0xf075b9(0x1b3,'rza)')]))return _0xf075b9(0x1ad,'Xts1');if(_0x311ecb['includes'](_0x5f54ee[_0xf075b9(0x225,'uC@)')])||_0x311ecb['includes'](_0xf075b9(0x178,'sS1W')))return'QDRANT_ERROR';if(_0x311ecb[_0xf075b9(0x264,'KUnX')](_0xf075b9(0x19a,'#sZg'))||_0x311ecb[_0xf075b9(0x1b0,'x!jj')](_0x5f54ee[_0xf075b9(0x16e,'*4V5')]))return _0xf075b9(0x20d,'q2mg');return'UNKNOWN';}function isRetriable(_0x4100db){const _0x50f099=_0x4f3251;return RETRIABLE_CODES[_0x50f099(0x1b9,'(grQ')](_0x4100db);}var 
MAX_ATTEMPTS=0x3;function shouldRetry(_0x54a5d1,_0x37c5b6){const _0x2a6b82=_0x4f3251,_0x2cbfee={'LcqqX':function(_0x119af9,_0x10f343){return _0x119af9(_0x10f343);},'PwhXz':function(_0x50a593,_0x4cdb24){return _0x50a593<_0x4cdb24;}};return _0x2cbfee[_0x2a6b82(0x254,'ZWcJ')](isRetriable,_0x54a5d1)&&_0x2cbfee[_0x2a6b82(0x157,'u7VD')](_0x37c5b6,MAX_ATTEMPTS);}function retryDelay(_0x23b615){return _0x23b615*0x7530;}typeof globalThis[_0x4f3251(0x1df,'x!jj')]==='undefined'&&(globalThis[_0x4f3251(0x22f,'qJ&]')]=class DOMMatrix{constructor(_0x3b3eab){const _0x26e767=_0x4f3251,_0x533b11={'UmHBG':function(_0x162bd4,_0x43aee5){return _0x162bd4===_0x43aee5;},'oMlMy':function(_0x937e64,_0x38cceb){return _0x937e64===_0x38cceb;},'MjTuX':function(_0xfe1f07,_0x14dd33){return _0xfe1f07===_0x14dd33;},'UXURD':function(_0x24e4de,_0x54a6de){return _0x24e4de===_0x54a6de;},'GEypD':_0x26e767(0x1ea,'PHdi')};this['a']=0x1,this['b']=0x0,this['c']=0x0,this['d']=0x1,this['e']=0x0,this['f']=0x0,this['m11']=0x1,this[_0x26e767(0x243,'i7Kk')]=0x0,this['m13']=0x0,this[_0x26e767(0x24f,'c8pb')]=0x0,this[_0x26e767(0x1f5,'AnpV')]=0x0,this[_0x26e767(0x233,'rTdk')]=0x1,this[_0x26e767(0x1d8,'KUnX')]=0x0,this[_0x26e767(0x1e1,'jufK')]=0x0,this[_0x26e767(0x14c,'u7VD')]=0x0,this[_0x26e767(0x15f,'qJ&]')]=0x0,this[_0x26e767(0x253,'6mAj')]=0x1,this[_0x26e767(0x255,'x!jj')]=0x0,this['m41']=0x0,this[_0x26e767(0x188,'2TH)')]=0x0,this['m43']=0x0,this[_0x26e767(0x231,'oqoT')]=0x1,this['is2D']=!![],this[_0x26e767(0x228,'&gdT')]=!![];if(Array[_0x26e767(0x1e8,'ZWcJ')](_0x3b3eab)&&_0x533b11[_0x26e767(0x1c0,'2TH)')](_0x3b3eab['length'],0x6)){const _0x38e62f=_0x26e767(0x227,'rza)')[_0x26e767(0x175,'B^T#')]('|');let 
_0x248962=0x0;while(!![]){switch(_0x38e62f[_0x248962++]){case'0':this[_0x26e767(0x250,'J9x(')]=this['e'];continue;case'1':this[_0x26e767(0x1c6,'&gdT')]=this['c'];continue;case'2':this['m42']=this['f'];continue;case'3':this['isIdentity']=_0x533b11[_0x26e767(0x249,'x!jj')](this['a'],0x1)&&_0x533b11['MjTuX'](this['b'],0x0)&&_0x533b11[_0x26e767(0x23e,'AnpV')](this['c'],0x0)&&this['d']===0x1&&_0x533b11['UXURD'](this['e'],0x0)&&this['f']===0x0;continue;case'4':[this['a'],this['b'],this['c'],this['d'],this['e'],this['f']]=_0x3b3eab;continue;case'5':this[_0x26e767(0x1e2,'Z(oZ')]=this['b'];continue;case'6':this[_0x26e767(0x20c,'KUnX')]=this['a'];continue;case'7':this[_0x26e767(0x233,'rTdk')]=this['d'];continue;}break;}}else{if(typeof _0x3b3eab===_0x533b11[_0x26e767(0x154,'Rx(M')]&&_0x3b3eab!==''){const _0x135675=_0x3b3eab[_0x26e767(0x170,'AnpV')](/matrix\(|\)/g,'')[_0x26e767(0x207,'Xts1')](',')[_0x26e767(0x190,'P1ZY')](Number);if(_0x533b11[_0x26e767(0x22a,'Z(oZ')](_0x135675[_0x26e767(0x16d,'BwKV')],0x6)){const _0x434fc7='0|4|6|2|5|1|3'[_0x26e767(0x1d5,'6mAj')]('|');let _0x4b8299=0x0;while(!![]){switch(_0x434fc7[_0x4b8299++]){case'0':[this['a'],this['b'],this['c'],this['d'],this['e'],this['f']]=_0x135675;continue;case'1':this['m41']=this['e'];continue;case'2':this[_0x26e767(0x194,'ENsj')]=this['c'];continue;case'3':this[_0x26e767(0x1b4,'&gdT')]=this['f'];continue;case'4':this['m11']=this['a'];continue;case'5':this['m22']=this['d'];continue;case'6':this[_0x26e767(0x200,'ENsj')]=this['b'];continue;}break;}}}}}[_0x4f3251(0x246,'fLE0')](_0x41507f){const _0x5f243e=_0x4f3251,_0x2c37f8={'EVbeX':function(_0x32ddbf,_0x5b3c53){return _0x32ddbf*_0x5b3c53;},'jxrfS':function(_0x8ce785,_0x879473){return _0x8ce785+_0x879473;},'zkIhO':function(_0x5c4d96,_0xd3190d){return _0x5c4d96*_0xd3190d;}};return new 
DOMMatrix([this['a']*_0x41507f['a']+_0x2c37f8[_0x5f243e(0x242,'jufK')](this['c'],_0x41507f['b']),_0x2c37f8['jxrfS'](this['b']*_0x41507f['a'],this['d']*_0x41507f['b']),_0x2c37f8[_0x5f243e(0x265,'AwG@')](this['a'],_0x41507f['c'])+this['c']*_0x41507f['d'],this['b']*_0x41507f['c']+_0x2c37f8[_0x5f243e(0x1dd,'QaQ#')](this['d'],_0x41507f['d']),_0x2c37f8[_0x5f243e(0x1a3,'^3bn')](this['a']*_0x41507f['e']+_0x2c37f8[_0x5f243e(0x148,'2TH)')](this['c'],_0x41507f['f']),this['e']),_0x2c37f8['jxrfS'](_0x2c37f8[_0x5f243e(0x1ca,'sS1W')](this['b'],_0x41507f['e'])+this['d']*_0x41507f['f'],this['f'])]);}[_0x4f3251(0x23f,'fLE0')](_0x3a6636=0x1,_0x49161e=_0x3a6636,_0x29ae91=0x1,_0x5ccda7=0x0,_0x15d123=0x0){const _0xb8c396=_0x4f3251,_0x19e7a4={'smkuS':function(_0x42051f,_0x426e2f){return _0x42051f*_0x426e2f;},'RIflm':function(_0x25a9f2,_0x684bb5){return _0x25a9f2*_0x684bb5;},'MEjCa':function(_0x186e38,_0x110f23){return _0x186e38+_0x110f23;},'JIhhI':function(_0x2016b7,_0x2e7fb4){return _0x2016b7*_0x2e7fb4;},'IuLJX':function(_0x511d8e,_0x53ec3d){return _0x511d8e*_0x53ec3d;},'CoTWW':function(_0x3fd9c2,_0x1c919a){return _0x3fd9c2*_0x1c919a;},'uEZII':function(_0x2c8907,_0x5e5ea9){return _0x2c8907*_0x5e5ea9;},'QZuQF':function(_0x692a7d,_0xa41061){return _0x692a7d*_0xa41061;},'KXpef':function(_0xea9166,_0x3a0b58){return _0xea9166*_0x3a0b58;}};return new 
DOMMatrix([_0x19e7a4[_0xb8c396(0x248,'(grQ')](this['a'],_0x3a6636),this['b']*_0x3a6636,this['c']*_0x49161e,this['d']*_0x49161e,_0x19e7a4[_0xb8c396(0x184,'B2Lf')](this['a'],_0x19e7a4[_0xb8c396(0x263,'AwG@')](_0x19e7a4['RIflm'](_0x3a6636,-_0x5ccda7),_0x5ccda7))+_0x19e7a4[_0xb8c396(0x1ce,'6mAj')](this['c'],_0x19e7a4[_0xb8c396(0x1a4,'Hc5G')](_0x19e7a4[_0xb8c396(0x16f,'0Jbg')](_0x49161e,-_0x15d123),_0x15d123))+this['e'],_0x19e7a4[_0xb8c396(0x189,'rza)')](_0x19e7a4[_0xb8c396(0x229,'PHdi')](this['b'],_0x19e7a4[_0xb8c396(0x241,'u7VD')](_0x19e7a4['uEZII'](_0x3a6636,-_0x5ccda7),_0x5ccda7))+_0x19e7a4[_0xb8c396(0x24e,'v2)a')](this['d'],_0x19e7a4['MEjCa'](_0x19e7a4[_0xb8c396(0x245,'HzW3')](_0x49161e,-_0x15d123),_0x15d123)),this['f'])]);}[_0x4f3251(0x269,'AwG@')](_0x5a71d4=0x1,_0x47bc6a=0x1){const _0x41d7b3=_0x4f3251;return this[_0x41d7b3(0x25a,'BwKV')](_0x5a71d4,_0x47bc6a);}['translate'](_0x1c4aad=0x0,_0x13fe2b=0x0){const _0x148edd=_0x4f3251,_0x39ab84={'EOYlV':function(_0x53986a,_0x534f64){return _0x53986a*_0x534f64;},'AGhdY':function(_0x119e92,_0x4b6cbc){return _0x119e92*_0x4b6cbc;}};return new DOMMatrix([this['a'],this['b'],this['c'],this['d'],this['a']*_0x1c4aad+_0x39ab84['EOYlV'](this['c'],_0x13fe2b)+this['e'],_0x39ab84[_0x148edd(0x18b,'AwG@')](this['b'],_0x1c4aad)+this['d']*_0x13fe2b+this['f']]);}[_0x4f3251(0x1d4,'Hc5G')](_0x42f2ae=0x0){const _0x416b42=_0x4f3251,_0x2f16b0={'EqqSF':function(_0xb4e617,_0x4b29fc){return _0xb4e617*_0x4b29fc;}},_0x33b9b3=_0x2f16b0['EqqSF'](_0x42f2ae,Math['PI'])/0xb4,_0x571d10=Math[_0x416b42(0x172,'Tf43')](_0x33b9b3),_0x3162dd=Math[_0x416b42(0x1d0,'rTdk')](_0x33b9b3);return this[_0x416b42(0x19c,'ZWcJ')](new DOMMatrix([_0x571d10,_0x3162dd,-_0x3162dd,_0x571d10,0x0,0x0]));}[_0x4f3251(0x149,'v2)a')](){const _0x3b7c58=_0x4f3251,_0x273490={'AHoWG':function(_0x58fce1,_0x4b693f){return _0x58fce1*_0x4b693f;},'OcYiW':function(_0x580e8b,_0x3e677d){return _0x580e8b/_0x3e677d;},'oSmJY':function(_0xdebac1,_0x3c32e3){return 
_0xdebac1/_0x3c32e3;},'IHOnn':function(_0xb293d6,_0x223952){return _0xb293d6/_0x223952;},'CWtXY':function(_0x3cfa91,_0x20554b){return _0x3cfa91-_0x20554b;},'RmjiV':function(_0xf71f5c,_0x31c039){return _0xf71f5c*_0x31c039;},'tTxSb':function(_0x24a2b9,_0x1ee8e9){return _0x24a2b9*_0x1ee8e9;},'yYLTj':function(_0x229173,_0x5a20cc){return _0x229173*_0x5a20cc;}},_0x4da4f0=_0x273490['AHoWG'](this['a'],this['d'])-_0x273490[_0x3b7c58(0x183,'q2mg')](this['b'],this['c']);if(Math[_0x3b7c58(0x238,'rTdk')](_0x4da4f0)<1e-10)return new DOMMatrix();return new DOMMatrix([this['d']/_0x4da4f0,_0x273490[_0x3b7c58(0x25c,'oEXD')](-this['b'],_0x4da4f0),_0x273490[_0x3b7c58(0x156,'uC@)')](-this['c'],_0x4da4f0),_0x273490[_0x3b7c58(0x23c,'&gdT')](this['a'],_0x4da4f0),_0x273490[_0x3b7c58(0x1f4,'I!IE')](_0x273490[_0x3b7c58(0x16a,'QaQ#')](this['c'],this['f']),_0x273490[_0x3b7c58(0x1da,'HzW3')](this['d'],this['e']))/_0x4da4f0,_0x273490[_0x3b7c58(0x1ef,'AwG@')](this['b']*this['e'],_0x273490['yYLTj'](this['a'],this['f']))/_0x4da4f0]);}[_0x4f3251(0x1be,'qJ&]')](_0x31dcb7={}){const _0x945dfc=_0x4f3251,_0x3d6cda={'lWBvN':function(_0x2263df,_0x53e147){return _0x2263df+_0x53e147;},'LJogb':function(_0xf2fcb1,_0x4256ad){return _0xf2fcb1*_0x4256ad;},'lAQIc':function(_0x53ec72,_0x48a73c){return _0x53ec72*_0x48a73c;},'SDzTL':function(_0x1fcf6a,_0x1b32ba){return _0x1fcf6a*_0x1b32ba;}},_0x3560a7=_0x31dcb7['x']??0x0,_0x4d5cfc=_0x31dcb7['y']??0x0;return{'x':_0x3d6cda[_0x945dfc(0x239,'uC@)')](_0x3d6cda['LJogb'](this['a'],_0x3560a7)+_0x3d6cda['lAQIc'](this['c'],_0x4d5cfc),this['e']),'y':_0x3d6cda['LJogb'](this['b'],_0x3560a7)+_0x3d6cda[_0x945dfc(0x1fd,'2Akt')](this['d'],_0x4d5cfc)+this['f'],'z':0x0,'w':0x1};}[_0x4f3251(0x1cb,'QaQ#')](){return this['multiply'](new DOMMatrix([-0x1,0x0,0x0,0x1,0x0,0x0]));}['flipY'](){const _0xe9741f=_0x4f3251;return this[_0xe9741f(0x158,'%0f3')](new DOMMatrix([0x1,0x0,0x0,-0x1,0x0,0x0]));}['skewX'](_0x4eedc5=0x0){const _0x35115f=_0x4f3251;return this[_0x35115f(0x260,'AwG@')](new 
DOMMatrix([0x1,0x0,Math[_0x35115f(0x15e,'rza)')](_0x4eedc5*Math['PI']/0xb4),0x1,0x0,0x0]));}[_0x4f3251(0x17b,'KUnX')](_0x59a084=0x0){const _0x47eab1=_0x4f3251,_0x5c77af={'VEyPP':function(_0x5efb5d,_0x3ca900){return _0x5efb5d*_0x3ca900;}};return this['multiply'](new DOMMatrix([0x1,Math[_0x47eab1(0x208,'oqoT')](_0x5c77af[_0x47eab1(0x1a2,']6ux')](_0x59a084,Math['PI'])/0xb4),0x0,0x1,0x0,0x0]));}['toFloat32Array'](){const _0x51adc7=_0x4f3251;return new Float32Array([this[_0x51adc7(0x1c5,'&TGz')],this[_0x51adc7(0x19e,'KUnX')],this[_0x51adc7(0x21e,'P1ZY')],this[_0x51adc7(0x159,'u7VD')],this[_0x51adc7(0x240,'ZWcJ')],this[_0x51adc7(0x21f,'(grQ')],this[_0x51adc7(0x1ab,'oEXD')],this[_0x51adc7(0x204,'B2Lf')],this[_0x51adc7(0x1a1,'ENsj')],this[_0x51adc7(0x187,'oEXD')],this[_0x51adc7(0x1a9,'^3bn')],this['m34'],this[_0x51adc7(0x1eb,'2TH)')],this['m42'],this[_0x51adc7(0x1af,'x!jj')],this['m44']]);}['toFloat64Array'](){const _0x26bcc1=_0x4f3251;return new Float64Array([this[_0x26bcc1(0x17e,'P1ZY')],this['m12'],this[_0x26bcc1(0x14f,'i7Kk')],this[_0x26bcc1(0x26a,'sS1W')],this[_0x26bcc1(0x182,'&TGz')],this[_0x26bcc1(0x147,'oEXD')],this[_0x26bcc1(0x24c,'u7VD')],this['m24'],this[_0x26bcc1(0x14c,'u7VD')],this[_0x26bcc1(0x1d9,'x!jj')],this[_0x26bcc1(0x224,']6ux')],this[_0x26bcc1(0x235,'^3bn')],this[_0x26bcc1(0x1ee,'PHdi')],this[_0x26bcc1(0x24a,'qJ&]')],this[_0x26bcc1(0x162,'BwKV')],this[_0x26bcc1(0x1f1,'6mAj')]]);}[_0x4f3251(0x1b5,'E^Gu')](){const _0x414d9f=_0x4f3251;return _0x414d9f(0x1f7,'rza)')+this['a']+',\x20'+this['b']+',\x20'+this['c']+',\x20'+this['d']+',\x20'+this['e']+',\x20'+this['f']+')';}static[_0x4f3251(0x244,'I!IE')](_0x951540={}){return new DOMMatrix([_0x951540['a']??0x1,_0x951540['b']??0x0,_0x951540['c']??0x0,_0x951540['d']??0x1,_0x951540['e']??0x0,_0x951540['f']??0x0]);}static[_0x4f3251(0x223,'Z(oZ')](_0x2dcced){return new DOMMatrix([..._0x2dcced]);}static[_0x4f3251(0x161,'J9x(')](_0x2b7481){return new DOMMatrix([..._0x2b7481]);}});typeof 
globalThis[_0x4f3251(0x15a,'8It*')]===_0x4f3251(0x1bf,'S[06')&&(globalThis['DOMPoint']=class DOMPoint{constructor(_0x5e378d=0x0,_0xa333a8=0x0,_0x62d9f7=0x0,_0x47eb3f=0x1){this['x']=_0x5e378d,this['y']=_0xa333a8,this['z']=_0x62d9f7,this['w']=_0x47eb3f;}static[_0x4f3251(0x192,'8It*')](_0x285293={}){return new DOMPoint(_0x285293['x']??0x0,_0x285293['y']??0x0,_0x285293['z']??0x0,_0x285293['w']??0x1);}});import*as _0x262d9f from'pdfjs-dist/legacy/build/pdf.mjs';var INDENT_THRESHOLD=0x50,KRISADIKA_WM='สํานักงานคณะกรรมการกฤษฎีกา',SPACE_GAP_THRESHOLD=0x2;async function extractPdfText(_0x32c6fe){const _0x33a54b=_0x4f3251,_0x29c478={'OuMDg':function(_0xe18d99,_0x3a8d1b){return _0xe18d99-_0x3a8d1b;},'PuyUw':function(_0x499ecb,_0x2c7545){return _0x499ecb<=_0x2c7545;},'AmZQF':function(_0x17b0fa,_0x57030b){return _0x17b0fa===_0x57030b;},'BQZEk':function(_0x36b595,_0x289de2){return _0x36b595-_0x289de2;},'pxoyK':function(_0x149379,_0x5cbc22){return _0x149379+_0x5cbc22;},'dNEZy':function(_0x36395c,_0x57a7a9){return _0x36395c>_0x57a7a9;}},_0x5cad57=new Uint8Array(_0x32c6fe[_0x33a54b(0x1fb,'rza)')]??_0x32c6fe),_0x547497=await _0x262d9f[_0x33a54b(0x1fc,'i7Kk')]({'data':_0x5cad57})[_0x33a54b(0x20f,'c8pb')],_0x1d093d=[];for(let _0x5501f0=0x1;_0x29c478[_0x33a54b(0x1c9,'qJ&]')](_0x5501f0,_0x547497[_0x33a54b(0x1e7,'AnpV')]);_0x5501f0++){const _0x24b018=await _0x547497['getPage'](_0x5501f0),_0x4d2b59=await _0x24b018[_0x33a54b(0x173,'#sZg')](),_0x35f024=[..._0x4d2b59['items']][_0x33a54b(0x25d,'I!IE')]((_0x5c1d11,_0x930b2d)=>{const _0x2b11bb=_0x33a54b,_0x41999e=_0x29c478[_0x2b11bb(0x1c8,'Hc5G')](_0x930b2d[_0x2b11bb(0x257,'BwKV')][0x5],_0x5c1d11[_0x2b11bb(0x1e5,'rza)')][0x5]);if(Math['abs'](_0x41999e)>0x2)return _0x41999e;return _0x5c1d11[_0x2b11bb(0x212,'Tf43')][0x4]-_0x930b2d[_0x2b11bb(0x26b,'Rx(M')][0x4];}),_0x3ece19=[];for(const _0x4c6a4e of 
_0x35f024){if(!_0x4c6a4e[_0x33a54b(0x1e0,'S[06')]?.['trim']())continue;if(_0x29c478[_0x33a54b(0x1bc,'E^Gu')](_0x4c6a4e['str']['trim'](),KRISADIKA_WM))continue;const _0x4a4df1=_0x4c6a4e[_0x33a54b(0x257,'BwKV')][0x4],_0x35026b=_0x4c6a4e[_0x33a54b(0x247,'QaQ#')][0x5],_0x592105=_0x3ece19[_0x3ece19['length']-0x1];if(!_0x592105||Math[_0x33a54b(0x252,'u7VD')](_0x592105['y']-_0x35026b)>0x2)_0x3ece19[_0x33a54b(0x1dc,'Rx(M')]({'y':_0x35026b,'firstX':_0x4a4df1,'lastX':_0x4a4df1,'lastWidth':_0x4c6a4e['width']??0x0,'parts':[_0x4c6a4e[_0x33a54b(0x18a,'0Jbg')]]});else{const _0x2fd89e=_0x29c478[_0x33a54b(0x15b,'z[&C')](_0x4a4df1,_0x29c478['pxoyK'](_0x592105[_0x33a54b(0x153,'Rx(M')],_0x592105[_0x33a54b(0x18f,'c8pb')])),_0x1bb24a=_0x2fd89e>SPACE_GAP_THRESHOLD?'\x20':'';_0x592105[_0x33a54b(0x226,'B^T#')][_0x33a54b(0x15c,'z[&C')](_0x29c478[_0x33a54b(0x261,'oEXD')](_0x1bb24a,_0x4c6a4e[_0x33a54b(0x1d7,'x!jj')])),_0x592105['lastX']=_0x4a4df1,_0x592105[_0x33a54b(0x23a,'sS1W')]=_0x4c6a4e[_0x33a54b(0x179,'*4V5')]??0x0;}}for(const _0x444af3 of _0x3ece19){const _0x123079=_0x444af3[_0x33a54b(0x1c7,'(grQ')][_0x33a54b(0x25b,'u7VD')]('')[_0x33a54b(0x21c,'B^T#')]();if(!_0x123079)continue;_0x1d093d[_0x33a54b(0x19f,'B^T#')]({'text':_0x123079,'isIndented':_0x29c478[_0x33a54b(0x17d,'B2Lf')](_0x444af3[_0x33a54b(0x258,'oqoT')],INDENT_THRESHOLD)});}}const _0x4bd3d8=[];for(let _0x13f1ff=0x0;_0x13f1ff<_0x1d093d[_0x33a54b(0x17a,'c8pb')];_0x13f1ff++){const {text:_0x51e147,isIndented:_0x96dc6}=_0x1d093d[_0x13f1ff];if(_0x13f1ff===0x0)_0x4bd3d8[_0x33a54b(0x1fe,'KUnX')](_0x51e147);else _0x96dc6?_0x4bd3d8[_0x33a54b(0x15d,'QaQ#')]('\x0a\x0a'+_0x51e147):_0x4bd3d8[_0x33a54b(0x203,'Xts1')]('\x0a'+_0x51e147);}return _0x4bd3d8[_0x33a54b(0x191,'ZWcJ')]('');}function sanitize(_0x3c373e){const _0x4ee084=_0x4f3251;if(!_0x3c373e)return _0x3c373e;return _0x3c373e[_0x4ee084(0x1ba,'HzW3')](/[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\u00AD\u200B-\u200F\uFEFF]/g,'');}var 
VARAK_START_RE=/^(การ|ให้|ใน|รัฐ|กฎหมาย|บุคคล|เมื่อ|ถ้า|ทั้งนี้|ในกรณี|โดย|หาก|เว้นแต่|ห้าม|ต้อง|แต่ถ้า|สมาชิก|คณะกรรมการ|องค์กร)/,MIN_VARAK_LEN=0xf,MIN_VARAK_TRIGGER_LEN=0x28;function toLines(_0x827f9e){const _0x360f23=_0x4f3251;return _0x827f9e[_0x360f23(0x1d6,'v2)a')]('\x0a')[_0x360f23(0x167,'B^T#')](_0x63df27=>_0x63df27[_0x360f23(0x18c,'ZWcJ')]()[_0x360f23(0x180,'x!jj')]>0x0);}function segmentVarakByRules(_0x2a8ac6){const _0x58ee36=_0x4f3251,_0x12bc2a={'dQQGH':function(_0x22df16,_0x571b7a){return _0x22df16<_0x571b7a;}},_0x59ed28=toLines(_0x2a8ac6);if(_0x59ed28[_0x58ee36(0x196,'rTdk')]>0x1){const _0x3bb5a1=[];let _0x311b77=[_0x59ed28[0x0]];for(let _0x5ab936=0x1;_0x12bc2a[_0x58ee36(0x1b6,'8It*')](_0x5ab936,_0x59ed28[_0x58ee36(0x164,'QaQ#')]);_0x5ab936++){const _0x210b2e=_0x59ed28[_0x5ab936]['trim']();if(!_0x59ed28[_0x5ab936][_0x58ee36(0x22c,'B^T#')]('\x20')&&VARAK_START_RE['test'](_0x210b2e)&&_0x210b2e[_0x58ee36(0x18e,'&gdT')]>=MIN_VARAK_TRIGGER_LEN){const _0x13bc18=_0x311b77['join']('\x0a')[_0x58ee36(0x1f2,'E^Gu')]();if(_0x13bc18[_0x58ee36(0x196,'rTdk')]>=MIN_VARAK_LEN)_0x3bb5a1[_0x58ee36(0x19d,'q2mg')](_0x13bc18);_0x311b77=[_0x59ed28[_0x5ab936]];}else _0x311b77[_0x58ee36(0x152,'HzW3')](_0x59ed28[_0x5ab936]);}const _0x30db38=_0x311b77[_0x58ee36(0x174,'Rx(M')]('\x0a')[_0x58ee36(0x1cf,'&TGz')]();if(_0x30db38['length']>=MIN_VARAK_LEN)_0x3bb5a1['push'](_0x30db38);if(_0x3bb5a1[_0x58ee36(0x25e,'E^Gu')]>0x1)return _0x3bb5a1;}const _0x248688=_0x2a8ac6['split'](/\n\n+/)[_0x58ee36(0x22d,'I!IE')](_0x3a8d37=>_0x3a8d37['trim']())['filter'](_0x654014=>_0x654014[_0x58ee36(0x18d,'oqoT')]>=MIN_VARAK_LEN);if(_0x248688[_0x58ee36(0x171,'2TH)')]>0x1)return 
_0x248688;return[_0x2a8ac6['trim']()][_0x58ee36(0x14a,'Z(oZ')](_0x13de95=>_0x13de95[_0x58ee36(0x1bd,'KUnX')]>0x0);}export{MAX_ATTEMPTS,THAI_RATIO,buildEmbedText,chunkByParagraphs,chunkBySections,chunkByTokens,classifyError,classifyPdf,extractChapter,extractChapterTitle,extractPdfText,normalizeDigits,normalizeText,retryDelay,sanitize,segmentVarakByRules,shouldRetry,thaiRatio};
|
package/index.d.ts
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
// ─── varak-chunker — TypeScript Declarations ─────────────────────
|
|
2
|
+
// Types only — no implementation logic.
|
|
3
|
+
|
|
4
|
+
// ─── Shared Types ─────────────────────────────────────────────────
|
|
5
|
+
|
|
6
|
+
/** A single semantic chunk produced by chunkBySections() */
|
|
7
|
+
export interface Chunk {
|
|
8
|
+
/** Raw text content of this chunk */
|
|
9
|
+
text: string;
|
|
10
|
+
/** Section label e.g. "มาตรา 252" — null if document has no section headers */
|
|
11
|
+
section: string | null;
|
|
12
|
+
/** Chapter label e.g. "หมวด 3" — null if no chapter found */
|
|
13
|
+
chapter: string | null;
|
|
14
|
+
/** Chapter title e.g. "การบริหารงานบุคคล" — null if not present */
|
|
15
|
+
chapterTitle: string | null;
|
|
16
|
+
/** 1-based วรรค index within the section (1 = first วรรค) */
|
|
17
|
+
varakIndex: number;
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
/** A single วรรค segment (1-based index plus text). Note: the shipped segmentVarakByRules() returns plain strings, not this shape. */
|
|
21
|
+
export interface VarakSegment {
|
|
22
|
+
/** 1-based index */
|
|
23
|
+
varakIndex: number;
|
|
24
|
+
/** Text content of this วรรค */
|
|
25
|
+
text: string;
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
/** Thai ratio thresholds used by thaiRatio() gate */
|
|
29
|
+
export interface ThaiRatioThresholds {
|
|
30
|
+
/** Minimum Thai ratio for embedded-text PDFs (default 0.30) */
|
|
31
|
+
EMBEDDED: number;
|
|
32
|
+
/** Minimum Thai ratio for OCR output (default 0.20) */
|
|
33
|
+
OCR: number;
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
// ─── normalize ────────────────────────────────────────────────────
|
|
37
|
+
|
|
38
|
+
/**
|
|
39
|
+
* Replace Thai digits (๐–๙) with Arabic digits (0–9).
|
|
40
|
+
* e.g. "มาตรา ๒๕๒" → "มาตรา 252"
|
|
41
|
+
*/
|
|
42
|
+
export function normalizeDigits(text: string): string;
|
|
43
|
+
|
|
44
|
+
/**
|
|
45
|
+
* Extract chapter label from the start of a text string.
|
|
46
|
+
* Returns "หมวด 3" style string, or null if not found.
|
|
47
|
+
*/
|
|
48
|
+
export function extractChapter(text: string): string | null;
|
|
49
|
+
|
|
50
|
+
/**
|
|
51
|
+
* Extract chapter title (the name after the chapter number).
|
|
52
|
+
* Returns the title string, or null if not found.
|
|
53
|
+
*/
|
|
54
|
+
export function extractChapterTitle(text: string): string | null;
|
|
55
|
+
|
|
56
|
+
// ─── chunking ─────────────────────────────────────────────────────
|
|
57
|
+
|
|
58
|
+
/**
|
|
59
|
+
* Strip watermarks, gazette page headers, OCR spacing artifacts,
|
|
60
|
+
* and lone surrogates from raw PDF text.
|
|
61
|
+
*/
|
|
62
|
+
export function normalizeText(text: string): string;
|
|
63
|
+
|
|
64
|
+
/**
|
|
65
|
+
* Main chunking pipeline: raw PDF text → semantic Chunk array.
|
|
66
|
+
* Splits by มาตรา/ข้อ/หมวด headers and further by วรรค boundaries.
|
|
67
|
+
*/
|
|
68
|
+
export function chunkBySections(rawText: string): Promise<Chunk[]>;
|
|
69
|
+
|
|
70
|
+
/**
|
|
71
|
+
* Fallback chunker: split text by token count with overlap.
|
|
72
|
+
* Returns plain string array (no metadata).
|
|
73
|
+
*/
|
|
74
|
+
export function chunkByTokens(text: string, maxChars?: number): string[];
|
|
75
|
+
|
|
76
|
+
/**
|
|
77
|
+
* Paragraph-based chunker: split on \n\n boundaries.
|
|
78
|
+
* Used for OCR output that lacks structured section headers.
|
|
79
|
+
* Returns plain string array.
|
|
80
|
+
*/
|
|
81
|
+
export function chunkByParagraphs(text: string, maxChars?: number): string[];
|
|
82
|
+
|
|
83
|
+
/**
|
|
84
|
+
* Build embedding input by prepending contextual header to a chunk.
|
|
85
|
+
* e.g. "เอกสาร: foo.pdf\nมาตรา: 252\n<chunk text>"
|
|
86
|
+
*/
|
|
87
|
+
export function buildEmbedText(chunk: Chunk, filename: string): string;
|
|
88
|
+
|
|
89
|
+
// ─── pdf-classify ─────────────────────────────────────────────────
|
|
90
|
+
|
|
91
|
+
/**
|
|
92
|
+
* Fraction of Thai characters in the text (0–1).
|
|
93
|
+
* Used to detect garbage / non-Thai PDFs before embedding.
|
|
94
|
+
*/
|
|
95
|
+
export function thaiRatio(text: string): number;
|
|
96
|
+
|
|
97
|
+
/**
|
|
98
|
+
* Classify PDF by content type:
|
|
99
|
+
* - 1 = no extractable text (image-only → needs OCR)
|
|
100
|
+
* - 2 = structured legal text (มาตรา + หมวด headers)
|
|
101
|
+
* - 3 = legal text without chapters (มาตรา only)
|
|
102
|
+
* - 4 = general text (no legal structure)
|
|
103
|
+
*/
|
|
104
|
+
export function classifyPdf(text: string): 1 | 2 | 3 | 4;
|
|
105
|
+
|
|
106
|
+
/**
|
|
107
|
+
* Map an Error to a typed error code string.
|
|
108
|
+
* e.g. "EMBED_TIMEOUT", "EMBED_BAD_INPUT", "NO_TEXT", "UNKNOWN"
|
|
109
|
+
*/
|
|
110
|
+
export function classifyError(err: unknown): string;
|
|
111
|
+
|
|
112
|
+
/**
|
|
113
|
+
* Whether a job with this error code should be retried.
|
|
114
|
+
*/
|
|
115
|
+
export function shouldRetry(code: string, attempt: number): boolean;
|
|
116
|
+
|
|
117
|
+
/**
|
|
118
|
+
* Milliseconds to wait before attempt N (exponential backoff).
|
|
119
|
+
*/
|
|
120
|
+
export function retryDelay(attempt: number): number;
|
|
121
|
+
|
|
122
|
+
/** Thai ratio thresholds (EMBEDDED: 0.30, OCR: 0.20) */
|
|
123
|
+
export const THAI_RATIO: ThaiRatioThresholds;
|
|
124
|
+
|
|
125
|
+
/** Maximum number of ingest attempts before giving up */
|
|
126
|
+
export const MAX_ATTEMPTS: number;
|
|
127
|
+
|
|
128
|
+
// ─── pdf-extractor ────────────────────────────────────────────────
|
|
129
|
+
|
|
130
|
+
/**
|
|
131
|
+
* Extract text from a PDF buffer using coordinate-aware paragraph detection.
|
|
132
|
+
* Preserves วรรค boundaries via indentation analysis (pdfjs-dist).
|
|
133
|
+
* Returns text with \n\n between paragraphs, \n within paragraphs.
|
|
134
|
+
*/
|
|
135
|
+
export function extractPdfText(buffer: Uint8Array): Promise<string>; // Node Buffers are still accepted (Buffer is a Uint8Array subclass); naming Buffer here would require Node typings in every consumer
|
|
136
|
+
|
|
137
|
+
// ─── sanitize ─────────────────────────────────────────────────────
|
|
138
|
+
|
|
139
|
+
/**
|
|
140
|
+
* Strip lone surrogates, zero-width spaces, soft hyphens,
|
|
141
|
+
* and C0/C1 control characters from a string.
|
|
142
|
+
* Safe to call on any untrusted input (LINE messages, OCR output).
|
|
143
|
+
*/
|
|
144
|
+
export function sanitize(str: string): string;
|
|
145
|
+
export function sanitize(str: null): null;
|
|
146
|
+
export function sanitize(str: undefined): undefined;
|
|
147
|
+
|
|
148
|
+
// ─── varak-segmenter ──────────────────────────────────────────────
|
|
149
|
+
|
|
150
|
+
/**
|
|
151
|
+
* Split a legal section text into วรรค segments at lines that open with a
|
|
152
|
+
* paragraph-start keyword (การ, ให้, ในกรณี, …), with a \n\n fallback.
|
|
153
|
+
* Returns the segment texts as plain strings, in document order.
|
|
154
|
+
*/
|
|
155
|
+
export function segmentVarakByRules(sectionText: string): string[];
|
package/package.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "varak-chunker",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Thai legal document chunking & OCR pipeline for RAG systems",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "dist/index.js",
|
|
7
|
+
"types": "index.d.ts",
|
|
8
|
+
"peerDependencies": {
|
|
9
|
+
"pdfjs-dist": "^4.0.0"
|
|
10
|
+
},
|
|
11
|
+
"license": "UNLICENSED",
|
|
12
|
+
"repository": {
|
|
13
|
+
"type": "git",
|
|
14
|
+
"url": "https://github.com/chalermchaic/varak-chunker-dist.git"
|
|
15
|
+
}
|
|
16
|
+
}
|