@intuned/browser-dev 0.1.4-dev.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (212) hide show
  1. package/.babelrc +21 -0
  2. package/.eslintignore +10 -0
  3. package/.eslintrc.js +39 -0
  4. package/BROWSER_SCRIPTS_SETUP.md +84 -0
  5. package/LICENSE +43 -0
  6. package/README.md +160 -0
  7. package/RELEASE.md +60 -0
  8. package/dist/ai/export.d.js +5 -0
  9. package/dist/ai/export.d.ts +641 -0
  10. package/dist/ai/extractStructuredData.js +320 -0
  11. package/dist/ai/extractStructuredDataUsingAi.js +142 -0
  12. package/dist/ai/extractionHelpers/screenshotHelpers.js +56 -0
  13. package/dist/ai/extractionHelpers/validateSchema.js +148 -0
  14. package/dist/ai/index.d.ts +641 -0
  15. package/dist/ai/index.js +19 -0
  16. package/dist/ai/isPageLoaded.js +80 -0
  17. package/dist/ai/prompt.js +39 -0
  18. package/dist/ai/tests/testCheckAllTypesAreStrings.spec.js +137 -0
  19. package/dist/ai/tests/testExtractFromContent.spec.js +372 -0
  20. package/dist/ai/tests/testExtractStructuredData.spec.js +646 -0
  21. package/dist/ai/tests/testIsPageLoaded.spec.js +277 -0
  22. package/dist/ai/tools/index.js +48 -0
  23. package/dist/ai/types/errors.js +67 -0
  24. package/dist/ai/types/models.js +45 -0
  25. package/dist/ai/types/types.js +48 -0
  26. package/dist/ai/validators.js +167 -0
  27. package/dist/common/Logger/index.js +60 -0
  28. package/dist/common/Logger/types.js +5 -0
  29. package/dist/common/SdkError.js +50 -0
  30. package/dist/common/aiModelsValidations.js +32 -0
  31. package/dist/common/ensureBrowserScripts.js +14 -0
  32. package/dist/common/extendedTest.js +157 -0
  33. package/dist/common/extractionHelpers.js +19 -0
  34. package/dist/common/formatZodError.js +18 -0
  35. package/dist/common/fuzzySearch/fuzzySearch.test.js +250 -0
  36. package/dist/common/fuzzySearch/levenshtein-search.js +298 -0
  37. package/dist/common/fuzzySearch/utils.js +23 -0
  38. package/dist/common/getModelProvider.js +18 -0
  39. package/dist/common/getSimplifiedHtml.js +122 -0
  40. package/dist/common/hashObject.js +32 -0
  41. package/dist/common/html2markdown/convertElementToMarkdown.js +469 -0
  42. package/dist/common/html2markdown/index.js +19 -0
  43. package/dist/common/jwtTokenManager.js +57 -0
  44. package/dist/common/loadRuntime.js +16 -0
  45. package/dist/common/locatorHelpers.js +41 -0
  46. package/dist/common/matching/collectStrings.js +32 -0
  47. package/dist/common/matching/levenshtein.js +40 -0
  48. package/dist/common/matching/matching.js +317 -0
  49. package/dist/common/matching/types.js +1 -0
  50. package/dist/common/noEmpty.js +9 -0
  51. package/dist/common/saveSnapshotWithExamples.js +60 -0
  52. package/dist/common/script.js +2602 -0
  53. package/dist/common/tests/testEnsureBrowserScript.spec.js +31 -0
  54. package/dist/common/xpathMapping.js +107 -0
  55. package/dist/helpers/clickUntilExhausted.js +85 -0
  56. package/dist/helpers/downloadFile.js +125 -0
  57. package/dist/helpers/export.d.js +5 -0
  58. package/dist/helpers/export.d.ts +1220 -0
  59. package/dist/helpers/extractMarkdown.js +35 -0
  60. package/dist/helpers/filterEmptyValues.js +54 -0
  61. package/dist/helpers/gotoUrl.js +98 -0
  62. package/dist/helpers/index.d.ts +1220 -0
  63. package/dist/helpers/index.js +122 -0
  64. package/dist/helpers/processDate.js +25 -0
  65. package/dist/helpers/resolveUrl.js +64 -0
  66. package/dist/helpers/sanitizeHtml.js +74 -0
  67. package/dist/helpers/saveFileToS3.js +50 -0
  68. package/dist/helpers/scrollToLoadContent.js +57 -0
  69. package/dist/helpers/tests/testClickUntilExhausted.spec.js +372 -0
  70. package/dist/helpers/tests/testDownloadFile.spec.js +206 -0
  71. package/dist/helpers/tests/testExtractMarkdown.spec.js +290 -0
  72. package/dist/helpers/tests/testFilterEmptyValues.spec.js +151 -0
  73. package/dist/helpers/tests/testGoToUrl.spec.js +37 -0
  74. package/dist/helpers/tests/testProcessDate.spec.js +13 -0
  75. package/dist/helpers/tests/testResolveUrl.spec.js +341 -0
  76. package/dist/helpers/tests/testSanitizeHtml.spec.js +330 -0
  77. package/dist/helpers/tests/testScrollToLoadContent.spec.js +163 -0
  78. package/dist/helpers/tests/testValidateDataUsingSchema.spec.js +342 -0
  79. package/dist/helpers/tests/testWithDomSettledWait.spec.js +164 -0
  80. package/dist/helpers/tests/testWithNetworkIdleWait.spec.js +114 -0
  81. package/dist/helpers/types/Attachment.js +115 -0
  82. package/dist/helpers/types/CustomTypeRegistry.js +48 -0
  83. package/dist/helpers/types/RunEnvironment.js +18 -0
  84. package/dist/helpers/types/ValidationError.js +17 -0
  85. package/dist/helpers/types/index.js +51 -0
  86. package/dist/helpers/uploadFileToS3.js +154 -0
  87. package/dist/helpers/utils/getS3Client.js +22 -0
  88. package/dist/helpers/utils/index.js +73 -0
  89. package/dist/helpers/utils/isDownload.js +10 -0
  90. package/dist/helpers/utils/isGenerateCodeMode.js +9 -0
  91. package/dist/helpers/utils/isLocator.js +9 -0
  92. package/dist/helpers/utils/jwtTokenManager.js +18 -0
  93. package/dist/helpers/validateDataUsingSchema.js +103 -0
  94. package/dist/helpers/waitForDomSettled.js +90 -0
  95. package/dist/helpers/withNetworkSettledWait.js +91 -0
  96. package/dist/index.d.js +16 -0
  97. package/dist/index.d.ts +10 -0
  98. package/dist/index.js +16 -0
  99. package/dist/intunedServices/ApiGateway/aiApiGateway.js +143 -0
  100. package/dist/intunedServices/ApiGateway/factory.js +16 -0
  101. package/dist/intunedServices/ApiGateway/providers/Anthropic.js +26 -0
  102. package/dist/intunedServices/ApiGateway/providers/Gemini.js +29 -0
  103. package/dist/intunedServices/ApiGateway/providers/OpenAI.js +29 -0
  104. package/dist/intunedServices/ApiGateway/tests/testApiGateway.spec.js +355 -0
  105. package/dist/intunedServices/ApiGateway/types.js +11 -0
  106. package/dist/intunedServices/cache/cache.js +61 -0
  107. package/dist/intunedServices/cache/index.js +12 -0
  108. package/dist/intunedServices/cache/tests/testCache.spec.js +117 -0
  109. package/dist/optimized-extractors/common/buildExamplesPrompt.js +12 -0
  110. package/dist/optimized-extractors/common/buildImagesFromPage.js +55 -0
  111. package/dist/optimized-extractors/common/extractStructuredDataUsingClaude.js +135 -0
  112. package/dist/optimized-extractors/common/extractStructuredDataUsingGoogle.js +37 -0
  113. package/dist/optimized-extractors/common/extractStructuredDataUsingOpenAi.js +132 -0
  114. package/dist/optimized-extractors/common/extractStrucutredDataUsingAiInstance.js +122 -0
  115. package/dist/optimized-extractors/common/findTableHeaders.js +162 -0
  116. package/dist/optimized-extractors/common/index.js +55 -0
  117. package/dist/optimized-extractors/common/isTableHeaderOrFooter.js +84 -0
  118. package/dist/optimized-extractors/common/matching/matching.js +212 -0
  119. package/dist/optimized-extractors/common/matching/matching.test.js +655 -0
  120. package/dist/optimized-extractors/common/matching/types.js +18 -0
  121. package/dist/optimized-extractors/common/matching/utils.js +184 -0
  122. package/dist/optimized-extractors/common/utils.js +58 -0
  123. package/dist/optimized-extractors/export.d.js +5 -0
  124. package/dist/optimized-extractors/export.d.ts +397 -0
  125. package/dist/optimized-extractors/extractArray.js +120 -0
  126. package/dist/optimized-extractors/extractObject.js +104 -0
  127. package/dist/optimized-extractors/index.d.ts +397 -0
  128. package/dist/optimized-extractors/index.js +31 -0
  129. package/dist/optimized-extractors/listExtractionHelpers/__tests__/dynamicListExtractor.spec.js +269 -0
  130. package/dist/optimized-extractors/listExtractionHelpers/__tests__/findSetOfXpathsToCreateAnArrayExtractor.test.js +22 -0
  131. package/dist/optimized-extractors/listExtractionHelpers/__tests__/getContainerElement.test.js +21 -0
  132. package/dist/optimized-extractors/listExtractionHelpers/__tests__/partOfSameArrayXpath.test.js +42 -0
  133. package/dist/optimized-extractors/listExtractionHelpers/__tests__/testArrayExtractorFromLocator.spec.js +146 -0
  134. package/dist/optimized-extractors/listExtractionHelpers/__tests__/testArrayExtractorFromPage.spec.js +130 -0
  135. package/dist/optimized-extractors/listExtractionHelpers/__tests__/verifyThatAllXpathsArePartOfSameArray.test.js +9 -0
  136. package/dist/optimized-extractors/listExtractionHelpers/dynamicListExtractor.js +160 -0
  137. package/dist/optimized-extractors/listExtractionHelpers/errors.js +46 -0
  138. package/dist/optimized-extractors/listExtractionHelpers/getListMatches.js +14 -0
  139. package/dist/optimized-extractors/listExtractionHelpers/runAiExtraction.js +243 -0
  140. package/dist/optimized-extractors/listExtractionHelpers/typesAndSchema.js +5 -0
  141. package/dist/optimized-extractors/listExtractionHelpers/utils/extractPropertiesUsingGPTFromArray.js +277 -0
  142. package/dist/optimized-extractors/listExtractionHelpers/utils/extractStructuredListUsingAi.js +44 -0
  143. package/dist/optimized-extractors/listExtractionHelpers/utils/getListContainerXpath.js +94 -0
  144. package/dist/optimized-extractors/listExtractionHelpers/utils/getRelativeContainerXpathSelector.js +20 -0
  145. package/dist/optimized-extractors/listExtractionHelpers/utils/getSimplifiedHtmlPerListItem.js +21 -0
  146. package/dist/optimized-extractors/listExtractionHelpers/utils/tablesUtils.js +48 -0
  147. package/dist/optimized-extractors/listExtractionHelpers/utils/validateOptions.js +52 -0
  148. package/dist/optimized-extractors/models/anthropicModel.js +23 -0
  149. package/dist/optimized-extractors/models/openaiModel.js +23 -0
  150. package/dist/optimized-extractors/objectExtractionHelpers/AIExtractors.js +73 -0
  151. package/dist/optimized-extractors/objectExtractionHelpers/__tests__/checksumUtils.test.js +103 -0
  152. package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromLocator.spec.js +107 -0
  153. package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromPage.spec.js +107 -0
  154. package/dist/optimized-extractors/objectExtractionHelpers/calculateObjectExampleHash.js +28 -0
  155. package/dist/optimized-extractors/objectExtractionHelpers/captureSnapshot.js +26 -0
  156. package/dist/optimized-extractors/objectExtractionHelpers/checksumUtils.js +32 -0
  157. package/dist/optimized-extractors/objectExtractionHelpers/constants.js +7 -0
  158. package/dist/optimized-extractors/objectExtractionHelpers/dynamicObjectExtractor.js +106 -0
  159. package/dist/optimized-extractors/objectExtractionHelpers/errors.js +42 -0
  160. package/dist/optimized-extractors/objectExtractionHelpers/findDomMatches.js +54 -0
  161. package/dist/optimized-extractors/objectExtractionHelpers/getSimplifiedHtml.js +122 -0
  162. package/dist/optimized-extractors/objectExtractionHelpers/typesAndSchemas.js +5 -0
  163. package/dist/optimized-extractors/objectExtractionHelpers/validateDynamicObjectExtractorOptions.js +52 -0
  164. package/dist/optimized-extractors/types/aiModelsValidation.js +45 -0
  165. package/dist/optimized-extractors/types/errors.js +42 -0
  166. package/dist/optimized-extractors/types/jsonSchema.d.js +5 -0
  167. package/dist/optimized-extractors/types/jsonSchema.d.ts +50 -0
  168. package/dist/optimized-extractors/types/types.js +5 -0
  169. package/dist/optimized-extractors/validators.js +152 -0
  170. package/dist/types/intuned-runtime.d.js +1 -0
  171. package/dist/types/intuned-runtime.d.ts +64 -0
  172. package/dist/vite-env.d.js +1 -0
  173. package/dist/vite-env.d.ts +9 -0
  174. package/generated-docs/ai/functions/extractStructuredData.mdx +255 -0
  175. package/generated-docs/ai/functions/isPageLoaded.mdx +88 -0
  176. package/generated-docs/ai/interfaces/ArraySchema.mdx +36 -0
  177. package/generated-docs/ai/interfaces/BasicSchema.mdx +14 -0
  178. package/generated-docs/ai/interfaces/BooleanSchema.mdx +28 -0
  179. package/generated-docs/ai/interfaces/ImageBufferContentItem.mdx +16 -0
  180. package/generated-docs/ai/interfaces/ImageUrlContentItem.mdx +16 -0
  181. package/generated-docs/ai/interfaces/NumberSchema.mdx +35 -0
  182. package/generated-docs/ai/interfaces/ObjectSchema.mdx +39 -0
  183. package/generated-docs/ai/interfaces/StringSchema.mdx +35 -0
  184. package/generated-docs/ai/interfaces/TextContentItem.mdx +14 -0
  185. package/generated-docs/ai/type-aliases/ContentItem.mdx +12 -0
  186. package/generated-docs/ai/type-aliases/JsonSchema.mdx +47 -0
  187. package/generated-docs/ai/type-aliases/SUPPORTED_MODELS.mdx +85 -0
  188. package/generated-docs/helpers/functions/downloadFile.mdx +99 -0
  189. package/generated-docs/helpers/functions/extractMarkdown.mdx +56 -0
  190. package/generated-docs/helpers/functions/filterEmptyValues.mdx +51 -0
  191. package/generated-docs/helpers/functions/goToUrl.mdx +124 -0
  192. package/generated-docs/helpers/functions/processDate.mdx +55 -0
  193. package/generated-docs/helpers/functions/resolveUrl.mdx +165 -0
  194. package/generated-docs/helpers/functions/sanitizeHtml.mdx +113 -0
  195. package/generated-docs/helpers/functions/saveFileToS3.mdx +127 -0
  196. package/generated-docs/helpers/functions/scrollToLoadContent.mdx +89 -0
  197. package/generated-docs/helpers/functions/uploadFileToS3.mdx +121 -0
  198. package/generated-docs/helpers/functions/validateDataUsingSchema.mdx +90 -0
  199. package/generated-docs/helpers/functions/waitForDomSettled.mdx +91 -0
  200. package/generated-docs/helpers/functions/withNetworkSettledWait.mdx +76 -0
  201. package/generated-docs/helpers/interfaces/Attachment.mdx +56 -0
  202. package/generated-docs/helpers/interfaces/S3Configs.mdx +52 -0
  203. package/generated-docs/helpers/interfaces/SanitizeHtmlOptions.mdx +22 -0
  204. package/generated-docs/helpers/type-aliases/AttachmentType.mdx +10 -0
  205. package/generated-docs/helpers/type-aliases/FileType.mdx +61 -0
  206. package/generated-docs/helpers/type-aliases/Trigger.mdx +62 -0
  207. package/how-to-generate-docs.md +61 -0
  208. package/how-to-run-tests.md +42 -0
  209. package/intuned-runtime-setup.md +13 -0
  210. package/package.json +124 -0
  211. package/tsconfig.eslint.json +5 -0
  212. package/tsconfig.json +26 -0
@@ -0,0 +1,2602 @@
1
+ "use strict";
2
+
3
+ Object.defineProperty(exports, "__esModule", { // defines exports.__esModule = true (the interop flag checked by ES-module-aware loaders)
4
+ value: true
5
+ });
6
+ exports.BROWSER_SCRIPT = void 0; // placeholder value; the declaration that follows assigns the actual script string
7
+ const BROWSER_SCRIPT = exports.BROWSER_SCRIPT = `(function () {
8
+ "use strict";
9
+
10
// Identifies which part of an element a matched string was taken from.
var MatchSource = {
  ATTRIBUTE: "attribute",
  TEXT_CONTENT: "text_content",
  DIRECT_TEXT_NODE: "direct_text_node",
};

// Identifies how a searched string relates to the value it was matched against.
var MatchMode = {
  FULL: "full",
  PARTIAL: "partial",
  FUZZY: "fuzzy",
};
22
+
23
/**
 * Lazily yields every index at which needle occurs in haystack, including
 * overlapping occurrences.
 *
 * @param {string} needle - Text to look for; an empty needle yields nothing.
 * @param {string} haystack - Text to search in.
 * @param {number} [startIndex=0] - Index at which the search starts.
 * @param {?number} [endIndex=null] - Exclusive upper bound for the end of an
 *   occurrence; null means the end of the haystack.
 * @yields {number} Start index of each occurrence, in ascending order.
 */
function* searchExact(needle, haystack, startIndex = 0, endIndex = null) {
  if (needle.length === 0) {
    return;
  }
  const limit = endIndex === null ? haystack.length : endIndex;
  let found = haystack.indexOf(needle, startIndex);
  // Stop at the first occurrence that would end past the limit.
  while (found > -1 && !(found + needle.length > limit)) {
    yield found;
    // Resume one character later so overlapping occurrences are reported too.
    found = haystack.indexOf(needle, found + 1);
  }
}
36
/**
 * Returns the characters of a string in reverse order.
 *
 * The string is reversed by Unicode code point rather than by UTF-16 code
 * unit, so a surrogate pair (for example an emoji) keeps its high/low order
 * and the result stays well-formed. The result always has the same length as
 * the input.
 *
 * @param {string} string - Text to reverse.
 * @returns {string} The reversed text.
 */
function reverse(string) {
  return [...string].reverse().join("");
}
39
+
40
/**
 * Builds a lookup from character to the lowest index at which that character
 * appears within the first maxDist + 1 characters of needle.
 *
 * @param {string} needle - Search term.
 * @param {number} maxDist - Highest needle index that is considered.
 * @returns {Object<string, number>} Map of character to its first index.
 */
function makeChar2needleIdx(needle, maxDist) {
  const firstIdxByChar = {};
  let idx = Math.min(needle.length - 1, maxDist);
  // Walk backwards so that the smallest index is written last and wins.
  while (idx >= 0) {
    firstIdxByChar[needle[idx]] = idx;
    idx -= 1;
  }
  return firstIdxByChar;
}
47
/**
 * Yields approximate occurrences of needle in haystack, dispatching to one of
 * three strategies:
 *  - maxDist of 0: plain exact search;
 *  - at least 10 needle characters per allowed edit (plus one): n-gram search;
 *  - otherwise: candidate-tracking search.
 *
 * @param {string} needle - Text to look for.
 * @param {string} haystack - Text to search in.
 * @param {number} maxDist - Maximum edit distance allowed for a match.
 * @yields {{start: number, end: number, dist: number}} Match span and distance.
 */
function* fuzzySearch(needle, haystack, maxDist) {
  const needleLength = needle.length;
  // A needle this much longer than the haystack cannot match within budget.
  if (needleLength > haystack.length + maxDist) {
    return;
  }
  if (maxDist === 0) {
    for (const start of searchExact(needle, haystack)) {
      yield { start: start, end: start + needleLength, dist: 0 };
    }
    return;
  }
  const ngramLength = Math.floor(needleLength / (maxDist + 1));
  if (ngramLength >= 10) {
    yield* fuzzySearchNgrams(needle, haystack, maxDist);
    return;
  }
  yield* fuzzySearchCandidates(needle, haystack, maxDist);
}
64
+ function _expand(needle, haystack, maxDist) { // Matches needle against a prefix of haystack within maxDist edits; returns [distance, matched haystack prefix length] or [null, null] when no such prefix is found.
65
+ maxDist = +maxDist; // coerce to a number
66
+ let firstDiff;
67
+ for (
68
+ firstDiff = 0;
69
+ firstDiff < Math.min(needle.length, haystack.length);
70
+ firstDiff++
71
+ ) {
72
+ if (needle.charCodeAt(firstDiff) !== haystack.charCodeAt(firstDiff))
73
+ break;
74
+ } // firstDiff is now the length of the common prefix of needle and haystack
75
+ if (firstDiff) {
76
+ needle = needle.slice(firstDiff); // drop the common prefix; it is added back to the returned length below
77
+ haystack = haystack.slice(firstDiff);
78
+ }
79
+ if (!needle) { // the whole needle was consumed by the common prefix: exact match
80
+ return [0, firstDiff];
81
+ } else if (!haystack) { // haystack exhausted: every remaining needle character counts as one edit
82
+ if (needle.length <= maxDist) {
83
+ return [needle.length, firstDiff];
84
+ } else {
85
+ return [null, null];
86
+ }
87
+ }
88
+ if (maxDist === 0) return [null, null]; // a mismatch remains and no edits are allowed
89
+ let scores = new Array(needle.length + 1); // score row from the previous haystack character
90
+ for (let i = 0; i <= maxDist; i++) {
91
+ scores[i] = i; // only entries 0..maxDist are initialised; the rest stay undefined
92
+ }
93
+ let newScores = new Array(needle.length + 1); // score row being filled for the current haystack character
94
+ let minScore = null; // best score seen so far for the complete needle
95
+ let minScoreIdx = null; // haystack index at which minScore was reached
96
+ let maxGoodScore = maxDist; // scores above this are not tracked as "good"; lowered whenever minScore improves
97
+ let firstGoodScoreIdx = 0;
98
+ let lastGoodScoreIdx = needle.length - 1;
99
+ for (let haystackIdx = 0; haystackIdx < haystack.length; haystackIdx++) { // one row per haystack character
100
+ const char = haystack.charCodeAt(haystackIdx);
101
+ const needleIdxStart = Math.max(0, firstGoodScoreIdx - 1);
102
+ const needleIdxLimit = Math.min( // restricts the row to the band of needle indexes still worth computing
103
+ haystackIdx + maxDist,
104
+ needle.length - 1,
105
+ lastGoodScoreIdx
106
+ );
107
+ newScores[0] = scores[0] + 1;
108
+ firstGoodScoreIdx = newScores[0] <= maxGoodScore ? 0 : null; // null means no good score found yet in this row
109
+ lastGoodScoreIdx = newScores[0] <= maxGoodScore ? 0 : -1;
110
+ let needleIdx;
111
+ for (
112
+ needleIdx = needleIdxStart;
113
+ needleIdx < needleIdxLimit;
114
+ needleIdx++
115
+ ) {
116
+ const score = (newScores[needleIdx + 1] = Math.min( // minimum of the three neighbouring cells, each plus its step cost
117
+ scores[needleIdx] + +(char !== needle.charCodeAt(needleIdx)),
118
+ scores[needleIdx + 1] + 1,
119
+ newScores[needleIdx] + 1
120
+ ));
121
+ if (score <= maxGoodScore) {
122
+ if (firstGoodScoreIdx === null) firstGoodScoreIdx = needleIdx + 1;
123
+ lastGoodScoreIdx = Math.max(
124
+ lastGoodScoreIdx,
125
+ needleIdx + 1 + (maxGoodScore - score)
126
+ );
127
+ }
128
+ }
129
+ const lastScore = (newScores[needleIdx + 1] = Math.min( // band edge: scores[needleIdx + 1] is not consulted here
130
+ scores[needleIdx] + +(char !== needle.charCodeAt(needleIdx)),
131
+ newScores[needleIdx] + 1
132
+ ));
133
+ if (lastScore <= maxGoodScore) {
134
+ if (firstGoodScoreIdx === null) firstGoodScoreIdx = needleIdx + 1;
135
+ lastGoodScoreIdx = needleIdx + 1;
136
+ }
137
+ if (
138
+ needleIdx === needle.length - 1 && // the band reached the final needle character
139
+ (minScore === null || lastScore <= minScore) // on ties the later haystack index replaces the earlier one
140
+ ) {
141
+ minScore = lastScore;
142
+ minScoreIdx = haystackIdx;
143
+ if (minScore < maxGoodScore) maxGoodScore = minScore;
144
+ }
145
+ [scores, newScores] = [newScores, scores]; // swap the two rows instead of allocating a new one
146
+ if (firstGoodScoreIdx === null) break; // no cell in this row is within budget: stop scanning
147
+ }
148
+ if (minScore !== null && minScore <= maxDist) {
149
+ return [minScore, minScoreIdx + 1 + firstDiff]; // matched length counts the stripped common prefix as well
150
+ } else {
151
+ return [null, null];
152
+ }
153
+ }
154
+ function* fuzzySearchNgrams(needle, haystack, maxDist) { // Yields {start, end, dist} matches found by locating exact n-gram hits and expanding each hit in both directions with _expand.
155
+ // use n-gram search
156
+ const ngramLen = Math.floor(needle.length / (maxDist + 1)); // NOTE(review): if this is 0 the loop below never advances; the visible caller only gets here when it is at least 10
157
+ const needleLen = needle.length;
158
+ const haystackLen = haystack.length;
159
+ for (
160
+ let ngramStartIdx = 0;
161
+ ngramStartIdx <= needle.length - ngramLen;
162
+ ngramStartIdx += ngramLen
163
+ ) { // step through consecutive, non-overlapping n-grams of the needle
164
+ const ngram = needle.slice(ngramStartIdx, ngramStartIdx + ngramLen);
165
+ const ngramEnd = ngramStartIdx + ngramLen;
166
+ const needleBeforeReversed = reverse(needle.slice(0, ngramStartIdx)); // reversed so the leftward expansion can reuse the prefix-based _expand
167
+ const needleAfter = needle.slice(ngramEnd);
168
+ const startIdx = Math.max(0, ngramStartIdx - maxDist);
169
+ const endIdx = Math.min(
170
+ haystackLen,
171
+ haystackLen - needleLen + ngramEnd + maxDist
172
+ );
173
+ for (const haystackMatchIdx of searchExact(
174
+ ngram,
175
+ haystack,
176
+ startIdx,
177
+ endIdx
178
+ )) {
179
+ // try to expand right: match the part of the needle after the n-gram against the haystack text that follows the hit
180
+ const [distRight, rightExpandSize] = _expand(
181
+ needleAfter,
182
+ haystack.slice(
183
+ haystackMatchIdx + ngramLen,
184
+ haystackMatchIdx - ngramStartIdx + needleLen + maxDist
185
+ ),
186
+ maxDist
187
+ );
188
+ if (distRight === null) continue;
189
+ const [distLeft, leftExpandSize] = _expand( // expand left with whatever edit budget the right side left over
190
+ needleBeforeReversed,
191
+ reverse(
192
+ haystack.slice(
193
+ Math.max(
194
+ 0,
195
+ haystackMatchIdx - ngramStartIdx - (maxDist - distRight)
196
+ ),
197
+ haystackMatchIdx
198
+ )
199
+ ),
200
+ maxDist - distRight
201
+ );
202
+ if (distLeft === null) continue;
203
+ yield {
204
+ start: haystackMatchIdx - leftExpandSize,
205
+ end: haystackMatchIdx + ngramLen + rightExpandSize,
206
+ dist: distLeft + distRight,
207
+ };
208
+ }
209
+ }
210
+ }
211
+ function* fuzzySearchCandidates(needle, haystack, maxDist) { // Yields {start, end, dist} matches by scanning haystack once while tracking partial-match candidates keyed by "startIdx,needleIdx,dist".
212
+ const needleLen = needle.length;
213
+ const haystackLen = haystack.length;
214
+ if (needleLen > haystackLen + maxDist) return; // needle too long to fit within the edit budget
215
+ const char2needleIdx = makeChar2needleIdx(needle, maxDist); // lookup used below to start a candidate at a haystack character
216
+ let prevCandidates = new Map(); // candidates from the last iteration
217
+ let candidates = new Map(); // new candidates from the current iteration
218
+ // iterate over the chars in the haystack, updating the candidates for each
219
+ for (let i = 0; i < haystack.length; i++) {
220
+ const haystackChar = haystack[i];
221
+ prevCandidates = candidates;
222
+ candidates = new Map();
223
+ const needleIdx = char2needleIdx[haystackChar];
224
+ if (needleIdx !== undefined) { // this character can start a new candidate; its initial dist equals needleIdx
225
+ if (needleIdx + 1 === needleLen) {
226
+ yield {
227
+ start: i,
228
+ end: i + 1,
229
+ dist: needleIdx,
230
+ };
231
+ } else {
232
+ candidates.set(\`\${i},\${needleIdx + 1},\${needleIdx}\`, {
233
+ startIdx: i,
234
+ needleIdx: needleIdx + 1,
235
+ dist: needleIdx,
236
+ });
237
+ }
238
+ }
239
+ for (const [, candidate] of prevCandidates) {
240
+ // if this sequence char is the candidate's next expected char
241
+ if (needle[candidate.needleIdx] === haystackChar) {
242
+ // if reached the end of the needle, return a match
243
+ if (candidate.needleIdx + 1 === needleLen) {
244
+ yield {
245
+ start: candidate.startIdx,
246
+ end: i + 1,
247
+ dist: candidate.dist,
248
+ };
249
+ } else {
250
+ // otherwise, update the candidate's needleIdx and keep it
251
+ candidates.set(
252
+ \`\${candidate.startIdx},\${candidate.needleIdx + 1},\${
253
+ candidate.dist
254
+ }\`,
255
+ {
256
+ startIdx: candidate.startIdx,
257
+ needleIdx: candidate.needleIdx + 1,
258
+ dist: candidate.dist,
259
+ }
260
+ );
261
+ }
262
+ } else {
263
+ if (candidate.dist === maxDist) continue; // edit budget used up: the candidate is dropped
264
+ candidates.set( // keep the candidate at the same needle position with one more edit (the haystack char is skipped)
265
+ \`\${candidate.startIdx},\${candidate.needleIdx},\${
266
+ candidate.dist + 1
267
+ }\`,
268
+ {
269
+ startIdx: candidate.startIdx,
270
+ needleIdx: candidate.needleIdx,
271
+ dist: candidate.dist + 1,
272
+ }
273
+ );
274
+ for (
275
+ let nSkipped = 1;
276
+ nSkipped <= maxDist - candidate.dist;
277
+ nSkipped++
278
+ ) { // try skipping nSkipped needle characters, each costing one edit
279
+ if (candidate.needleIdx + nSkipped === needleLen) {
280
+ yield {
281
+ start: candidate.startIdx,
282
+ end: i + 1,
283
+ dist: candidate.dist + nSkipped,
284
+ };
285
+ break;
286
+ } else if (
287
+ needle[candidate.needleIdx + nSkipped] === haystackChar
288
+ ) {
289
+ if (candidate.needleIdx + nSkipped + 1 === needleLen) {
290
+ yield {
291
+ start: candidate.startIdx,
292
+ end: i + 1,
293
+ dist: candidate.dist + nSkipped,
294
+ };
295
+ } else {
296
+ candidates.set(
297
+ \`\${candidate.startIdx},\${
298
+ candidate.needleIdx + 1 + nSkipped
299
+ },\${candidate.dist + nSkipped}\`,
300
+ {
301
+ startIdx: candidate.startIdx,
302
+ needleIdx: candidate.needleIdx + 1 + nSkipped,
303
+ dist: candidate.dist + nSkipped,
304
+ }
305
+ );
306
+ }
307
+ break;
308
+ }
309
+ }
310
+ if (i + 1 < haystackLen && candidate.needleIdx + 1 < needleLen) { // advance past both the needle char and the haystack char at a cost of one edit
311
+ candidates.set(
312
+ \`\${candidate.startIdx},\${candidate.needleIdx + 1},\${
313
+ candidate.dist + 1
314
+ }\`,
315
+ {
316
+ startIdx: candidate.startIdx,
317
+ needleIdx: candidate.needleIdx + 1,
318
+ dist: candidate.dist + 1,
319
+ }
320
+ );
321
+ }
322
+ }
323
+ }
324
+ }
325
+ for (const [, candidate] of candidates) { // haystack exhausted: charge the unmatched needle tail to each surviving candidate
326
+ candidate.dist += needle.length - candidate.needleIdx;
327
+ if (candidate.dist <= maxDist) {
328
+ yield {
329
+ start: candidate.startIdx,
330
+ end: haystack.length,
331
+ dist: candidate.dist,
332
+ };
333
+ }
334
+ }
335
+ }
336
+
337
/**
 * Returns the best fuzzy match of searchTerm inside content.
 *
 * The best match is the one with the smallest dist; among matches with the
 * same dist the longest span wins, and among those the first one produced by
 * fuzzySearch wins. The matches are consumed in a single pass, so they are
 * neither buffered nor sorted.
 *
 * @param {string} searchTerm - Text to look for.
 * @param {string} content - Text to search in.
 * @param {number} maxLDist - Maximum edit distance allowed for a match.
 * @returns {{start: number, end: number, dist: number}|undefined} The best
 *   match, or undefined when fuzzySearch produces none.
 */
function findClosestMatch(searchTerm, content, maxLDist) {
  let best;
  for (const candidate of fuzzySearch(searchTerm, content, maxLDist)) {
    if (best === undefined) {
      best = candidate;
      continue;
    }
    const isCloser = candidate.dist < best.dist;
    const isLongerTie =
      candidate.dist === best.dist &&
      candidate.end - candidate.start > best.end - best.start;
    // Strict comparisons keep the earliest candidate when two are equally good.
    if (isCloser || isLongerTie) {
      best = candidate;
    }
  }
  return best;
}
350
+ function normalizeSpacing(text) { // Returns text with whitespace runs collapsed to single spaces and both ends trimmed; falsy input gives "".
351
+ if (!text) { // null, undefined and the empty string all normalize to ""
352
+ return "";
353
+ }
354
+ // Replace newlines and tabs with spaces
355
+ let normalized = text.replace(/\\n/g, " ").replace(/\\t/g, " "); // the doubled escapes are needed because this code lives inside a template literal
356
+ // Replace multiple spaces with a single space
357
+ normalized = normalized.split(/\\s+/).join(" ");
358
+ return normalized.trim();
359
+ }
360
/**
 * Tests whether two strings are identical once their spacing is normalized.
 * The comparison is case-sensitive.
 *
 * @param {string} data - First string.
 * @param {string} value - Second string.
 * @returns {Array} A pair [isExact, normalizedValue]; [false, null] when
 *   either argument is falsy.
 */
function isMatchExact(data, value) {
  if (data && value) {
    const left = normalizeSpacing(data);
    const right = normalizeSpacing(value);
    return [left === right, right];
  }
  return [false, null];
}
368
/**
 * Computes the edit-distance budget for a search value from its length.
 *
 * The budget is a percentage of the length: 20% for lengths up to 10, then
 * decaying exponentially (on a curve that reaches 5% at length 600) and never
 * dropping below 5%. The result is rounded down and is always at least 1.
 *
 * @param {string} value - The value whose length determines the budget.
 * @returns {number} Maximum allowed edit distance (an integer of at least 1).
 */
function calculateMaxLDist(value) {
  const Pmax = 0.2;
  const Pmin = 0.05;
  const lengthAtPmax = 10;
  const size = value.length;
  let ratio = Pmax;
  if (size > lengthAtPmax) {
    const decay = -Math.log(Pmin / Pmax) / (600 - lengthAtPmax);
    ratio = Pmax * Math.exp(-decay * (size - lengthAtPmax));
  }
  if (ratio < Pmin) {
    ratio = Pmin;
  }
  const budget = Math.floor(size * ratio);
  return budget < 1 ? 1 : budget;
}
383
/**
 * Fuzzy-searches content for searchTerm, ignoring letter case and the exact
 * layout of whitespace.
 *
 * Both inputs are spacing-normalized first. The edit-distance budget is then
 * derived from the normalized search term, which is the text actually being
 * searched for, so stray newlines or indentation in the raw term cannot
 * inflate the budget.
 *
 * @param {string} searchTerm - Text to look for.
 * @param {string} content - Text to search in.
 * @returns {{found: boolean, matchedValue: ?string, distance: ?number,
 *   matchedSourceValue: ?string}} When found is true, matchedValue is the
 *   matching slice of the normalized content, matchedSourceValue is the whole
 *   normalized content and distance is the edit distance of the match. When
 *   found is false the other three fields are null.
 */
function isFuzzMatch(searchTerm, content) {
  // A fresh object per call, so callers may mutate the result safely.
  const notFound = () => ({
    found: false,
    matchedValue: null,
    distance: null,
    matchedSourceValue: null,
  });
  if (!searchTerm || !content) {
    return notFound();
  }
  const normalizedSearchTerm = normalizeSpacing(searchTerm);
  const normalizedContent = normalizeSpacing(content);
  // Whitespace-only input normalizes to an empty string and cannot match.
  if (!normalizedSearchTerm || !normalizedContent) {
    return notFound();
  }
  const maxLDist = calculateMaxLDist(normalizedSearchTerm);
  const match = findClosestMatch(
    normalizedSearchTerm.toLowerCase(),
    normalizedContent.toLowerCase(),
    maxLDist
  );
  if (!match) {
    return notFound();
  }
  return {
    found: true,
    matchedValue: normalizedContent.slice(match.start, match.end),
    matchedSourceValue: normalizedContent,
    distance: match.dist,
  };
}
415
/**
 * Reports whether a list of matches already contains a good one: either any
 * match that is not fuzzy, or a fuzzy match whose distance is below 5.
 *
 * A fuzzy distance of 0 counts as close; only a missing (non-numeric)
 * distance disqualifies a fuzzy match. A missing list is treated as empty.
 *
 * @param {Array<{match_mode: string, fuzzy_distance: ?number}>} matches -
 *   Match records to inspect.
 * @returns {boolean} True when at least one match is good enough.
 */
function hasNonFuzzyOrCloseFuzzyMatch(matches) {
  const closeFuzzyDistance = 5;
  return (matches ?? []).some((match) => {
    if (match.match_mode !== MatchMode.FUZZY) {
      return true;
    }
    const distance = match.fuzzy_distance;
    // An explicit type check: a truthiness test would wrongly reject 0.
    return typeof distance === "number" && distance < closeFuzzyDistance;
  });
}
427
+ function getElementXPath(element) { // Builds a slash-separated XPath for element by recursing up through parentNode; returns null for a falsy node, a node without a parent, or the document node.
428
+ if (!element || !element.parentNode || element.nodeName === "#document") {
429
+ return null;
430
+ }
431
+ let siblingsCount = 1; // 1-based position of element among preceding element siblings with the same node name
432
+ const parent = element.parentNode;
433
+ const nodeName = element.nodeName.toLowerCase();
434
+ const siblings = Array.from(parent.childNodes).filter(
435
+ (node) => node.nodeType === 1 // Node.ELEMENT_NODE
436
+ );
437
+ for (const sibling of siblings) {
438
+ if (sibling === element) { // only siblings that come before element are counted
439
+ break;
440
+ }
441
+ if (sibling.nodeName.toLowerCase() === nodeName) {
442
+ siblingsCount++;
443
+ }
444
+ }
445
+ const parentXPath = getElementXPath(parent); // null once the recursion reaches a node whose parent is the document
446
+ if (element.nodeName === "#text") { // a text node is addressed by the XPath of its parent
447
+ return parentXPath;
448
+ }
449
+
450
+ let nodeXPath;
451
+ if (
452
+ element.namespaceURI &&
453
+ element.namespaceURI !== "http://www.w3.org/1999/xhtml" // HTML namespace, this will make xpath locator succeed without [name()='']
454
+ ) {
455
+ // Element is in a namespace (SVG, MathML, or custom namespace)
456
+ nodeXPath = \`*[name()='\${nodeName}']\`; // NOTE(review): no sibling index and a lower-cased name here, so same-named namespaced siblings get identical paths - confirm this is intended
457
+ } else {
458
+ // Standard HTML element
459
+ nodeXPath = \`\${nodeName}[\${siblingsCount}]\`;
460
+ }
461
+
462
+ return parentXPath ? \`\${parentXPath}/\${nodeXPath}\` : nodeXPath; // the topmost step has no leading slash
463
+ }
464
/**
 * Walks the element children of node depth-first and calls conditionFunc on
 * each child that has a children collection. The walk does not descend into
 * a child for which conditionFunc returns a truthy value.
 *
 * This function removes nothing itself; any pruning is whatever conditionFunc
 * does to the nodes it is given.
 *
 * @param {{children: ?Iterable}} node - Node whose descendants are visited.
 * @param {function(Object): *} conditionFunc - Called once per visited child.
 * @returns {undefined}
 */
function traverseAndPrune(node, conditionFunc) {
  // Snapshot the children first so changes made by conditionFunc cannot
  // disturb the iteration.
  const snapshot = Array.from(node.children ?? []);
  for (const child of snapshot) {
    if (!child.children) {
      continue;
    }
    if (conditionFunc(child)) {
      continue;
    }
    traverseAndPrune(child, conditionFunc);
  }
}
474
/**
 * Checks, case-insensitively and after spacing normalization, whether input
 * occurs inside dom.
 *
 * @param {string} input - Text to look for.
 * @param {string} dom - Text to search in.
 * @returns {Array} A triple [found, matchedText, normalizedDom]. matchedText
 *   is the slice of the normalized dom text at the match position (keeping
 *   its original letter case), or null when nothing was found. When either
 *   argument is falsy the result is [false, null, null].
 */
function isPartOfString(input, dom) {
  if (!input || !dom) {
    return [false, null, null];
  }
  const needle = normalizeSpacing(input);
  const haystack = normalizeSpacing(dom);
  const at = haystack.toLowerCase().indexOf(needle.toLowerCase());
  if (at === -1) {
    return [false, null, haystack];
  }
  return [true, haystack.substring(at, at + needle.length), haystack];
}
492
+
493
/**
 * Collects matches for every string in stringsList within domNode, moving on
 * to fuzzier strategies only for strings that still lack a good match:
 *  1. matchExactStrings runs for all strings;
 *  2. matchFuzzyStrings runs for the strings without a non-fuzzy or
 *     close-fuzzy match (if there are none, the exact result is returned);
 *  3. matchFuzzyAttributes runs for the strings that still have no such
 *     match after step 2.
 *
 * @param {Object} domNode - Root node handed to the three matchers.
 * @param {string[]} stringsList - Strings to locate.
 * @returns {Object<string, Array>} Map of string to its match records; the
 *   object returned by matchExactStrings is extended in place.
 */
function matchStringsWithDomContent(domNode, stringsList) {
  const matchesByString = matchExactStrings(domNode, stringsList);
  const lacksGoodMatch = (value) =>
    !hasNonFuzzyOrCloseFuzzyMatch(matchesByString[value]);
  // Appends extra matches to existing entries, creating entries as needed.
  const mergeIn = (extraMatches) => {
    for (const [value, found] of Object.entries(extraMatches)) {
      if (value in matchesByString) {
        matchesByString[value].push(...found);
      } else {
        matchesByString[value] = found;
      }
    }
  };

  const needFuzzyText = stringsList.filter((value) => lacksGoodMatch(value));
  if (needFuzzyText.length === 0) {
    return matchesByString;
  }
  mergeIn(matchFuzzyStrings(domNode, needFuzzyText));

  // attributes to try fuzzy match attributes on
  const needFuzzyAttributes = stringsList.filter((value) =>
    lacksGoodMatch(value)
  );
  mergeIn(matchFuzzyAttributes(domNode, needFuzzyAttributes));
  return matchesByString;
}
528
/**
 * Collects exact (full or partial, non-fuzzy) occurrences of every string in
 * stringsList inside domNode and all of its descendants.
 *
 * For each element three places are inspected, in this order: every attribute
 * value, the href property (delegated to matchHref), each direct text-node
 * child (trimmed), and finally the element's whole textContent.
 *
 * Elements are visited in reverse document order, so descendants are seen
 * before their ancestors. Once a string has a match at some xpath, every
 * ancestor of that xpath is skipped for that string.
 *
 * @param {Element} domNode - root element of the subtree to search.
 * @param {string[]} stringsList - strings to look for.
 * @returns {Object<string, Array<object>>} map from each searched string to
 *   the list of match records found for it (possibly empty).
 */
function matchExactStrings(domNode, stringsList) {
  const allNodes = [domNode, ...domNode.querySelectorAll("*")].reverse();
  const matchesMap = Object.fromEntries(
    stringsList.map((data) => [data, []])
  );

  for (const tag of allNodes) {
    const xpath = getElementXPath(tag);
    // The trailing separator makes this a real ancestor test: without it
    // ".../div[1]" would be treated as an ancestor of ".../div[10]/span".
    // matchFuzzyStrings uses the same "/"-terminated prefix.
    const descendantPrefix = xpath + "/";
    const tagName = tag.tagName.toLowerCase();

    // Everything below depends only on the element, not on the searched
    // string, so it is read from the DOM once per element.
    const attributes = tag
      .getAttributeNames()
      .map((attr) => [attr, tag.getAttribute(attr) || ""]);
    const directTexts = Array.from(tag.childNodes)
      .filter((childNode) => childNode.nodeType === 3) // Node.TEXT_NODE
      .map((childNode) => childNode.textContent?.trim() || "")
      .filter(Boolean);
    const tagTextContent = tag.textContent || "";

    // Builds one match record; comparedText is the text the exactness check
    // runs against.
    const toMatch = (stringValue, comparedText, fields) => {
      const [isExact] = isMatchExact(stringValue, comparedText);
      return {
        attribute: fields.attribute,
        fuzzy_distance: null,
        match_mode: isExact ? MatchMode.FULL : MatchMode.PARTIAL,
        match_source: fields.source,
        matched_value: fields.matchedValue,
        matched_source_value: fields.matchedSourceValue,
        tag: tagName,
        xpath,
      };
    };

    for (const stringValue of stringsList) {
      const matches = matchesMap[stringValue];
      const isAncestorOfMatch = matches.some((match) =>
        (match.xpath || "").startsWith(descendantPrefix)
      );
      if (isAncestorOfMatch) continue;

      for (const [attr, attributeValue] of attributes) {
        const [isPartOfStringResult, matchedValue] = isPartOfString(
          stringValue,
          attributeValue
        );
        if (isPartOfStringResult) {
          matches.push(
            toMatch(stringValue, attributeValue, {
              attribute: attr,
              source: MatchSource.ATTRIBUTE,
              matchedValue,
              matchedSourceValue: attributeValue,
            })
          );
        }
      }

      // The href property may differ from the raw attribute value, so it is
      // matched separately.
      if (tag["href"]) {
        const result = matchHref(tag, stringValue);
        if (result) {
          matches.push(result);
        }
      }

      // Direct text nodes
      for (const directTextContent of directTexts) {
        const [isPartOfStringResult, matchedValue, sourceValue] =
          isPartOfString(stringValue, directTextContent);
        if (isPartOfStringResult) {
          matches.push(
            toMatch(stringValue, directTextContent, {
              attribute: null,
              source: MatchSource.DIRECT_TEXT_NODE,
              matchedValue,
              matchedSourceValue: sourceValue,
            })
          );
        }
      }

      // Whole text content of the element (including descendants)
      const [isPartOfStringResult, matchedValue, sourceValue] =
        isPartOfString(stringValue, tagTextContent);
      if (isPartOfStringResult) {
        matches.push(
          toMatch(stringValue, tagTextContent, {
            attribute: null,
            source: MatchSource.TEXT_CONTENT,
            matchedValue,
            matchedSourceValue: sourceValue,
          })
        );
      }
    }
  }
  return matchesMap;
}
617
/**
 * Fuzzy-matches every string in stringsToMatch against domNode and the nodes
 * visited by traverseAndPrune.
 *
 * For each visited node the candidates are, in order: every attribute value,
 * the node's textContent (when non-empty) and each non-empty trimmed direct
 * text-node child. The visitor handed to traverseAndPrune returns true when
 * the node produced no match and false when it produced at least one.
 *
 * After collection, a match is dropped when a later match for the same string
 * sits at a descendant xpath.
 *
 * @param {Element} domNode - root of the subtree to search.
 * @param {string[]} stringsToMatch - strings that still need a match.
 * @returns {Object<string, Array<object>>} map from string to fuzzy match
 *   records.
 */
function matchFuzzyStrings(domNode, stringsToMatch) {
  const matchesMap = Object.fromEntries(
    stringsToMatch.map((data) => [data, []])
  );

  const conditionFunc = (stringToMatch, node) => {
    const currentXPath = getElementXPath(node);
    let foundMatch = false;

    // Runs one fuzzy comparison and records the match when there is one.
    const probe = (candidateText, attribute, source) => {
      const outcome = isFuzzMatch(stringToMatch, candidateText);
      if (!outcome.found) {
        return;
      }
      matchesMap[stringToMatch].push({
        attribute,
        fuzzy_distance: outcome.distance,
        match_mode: MatchMode.FUZZY,
        match_source: source,
        matched_value: outcome.matchedValue,
        tag: node.tagName.toLowerCase(),
        xpath: currentXPath,
        matched_source_value: outcome.matchedSourceValue,
      });
      foundMatch = true;
    };

    node.getAttributeNames().forEach((attr) => {
      probe(node.getAttribute(attr) || "", attr, MatchSource.ATTRIBUTE);
    });

    const tagTextContent = node.textContent || "";
    if (tagTextContent) {
      probe(tagTextContent, null, MatchSource.TEXT_CONTENT);
    }

    // Direct text nodes (nodeType 3 is Node.TEXT_NODE); blank ones are ignored
    Array.from(node.childNodes)
      .filter((childNode) => childNode.nodeType === 3)
      .map((childNode) => childNode.textContent?.trim() || "")
      .filter(Boolean)
      .forEach((directTextContent) => {
        probe(directTextContent, null, MatchSource.DIRECT_TEXT_NODE);
      });

    return !foundMatch;
  };

  stringsToMatch.forEach((stringToMatch) => {
    conditionFunc(stringToMatch, domNode);
    traverseAndPrune(domNode, (node) => conditionFunc(stringToMatch, node));
  });

  // Keep a match only if no later match lives underneath it.
  for (const [stringToMatch, matches] of Object.entries(matchesMap)) {
    matchesMap[stringToMatch] = matches.filter((match, position) => {
      const descendantPrefix = (match.xpath || "") + "/";
      const hasLaterDescendant = matches
        .slice(position + 1)
        .some((later) => (later.xpath || "").startsWith(descendantPrefix));
      return !hasLaterDescendant;
    });
  }
  return matchesMap;
}
718
/**
 * Fuzzy-matches each string against attribute values of similar length.
 *
 * For every string, attribute values longer than 10 characters whose length
 * is within 20% of the string's length are joined into a single search text
 * and passed to isFuzzMatch. When a match is reported, the first attribute
 * returned by getAllAttributes whose value contains the matched text is
 * recorded as the match location.
 *
 * @param {Element} domNode - root of the subtree whose attributes are used.
 * @param {string[]} stringsToMatch - strings to look for.
 * @returns {Object<string, Array<object>>} map from string to match records.
 */
function matchFuzzyAttributes(domNode, stringsToMatch) {
  const matchesMap = Object.fromEntries(
    stringsToMatch.map((data) => [data, []])
  );
  const allAttributes = getAllAttributes(domNode);

  for (const stringToMatch of stringsToMatch) {
    const allowedLengthDiff = 0.2 * stringToMatch.length;
    const candidateValues = [];
    for (const { value } of allAttributes) {
      const isLongEnough = value.length > 10;
      const isSimilarLength =
        Math.abs(value.length - stringToMatch.length) <= allowedLengthDiff;
      if (isLongEnough && isSimilarLength) {
        candidateValues.push(value);
      }
    }
    const stringToSearchIn = candidateValues.join("\\n");

    const outcome = isFuzzMatch(stringToMatch, stringToSearchIn);
    const matchedValue = outcome.matchedValue;
    // Without a matched value there is nothing to locate.
    if (!outcome.found || !matchedValue) {
      continue;
    }

    const matchLine = allAttributes.find((attr) =>
      attr.value.includes(matchedValue)
    );
    if (!matchLine) {
      continue;
    }

    matchesMap[stringToMatch].push({
      attribute: matchLine.attr,
      fuzzy_distance: outcome.distance,
      match_mode: MatchMode.FUZZY,
      match_source: MatchSource.ATTRIBUTE,
      matched_value: matchedValue,
      xpath: matchLine.node,
      matched_source_value: matchLine.value,
      tag: matchLine.tag,
    });
  }
  return matchesMap;
}
756
/**
 * Lists the attributes of node and all of its descendants whose value is
 * longer than 10 characters.
 *
 * Elements are visited in reverse document order; within an element the
 * attributes keep the order reported by getAttributeNames.
 *
 * @param {Element} node - root of the subtree.
 * @returns {Array<{node: string, attr: string, value: string, tag: string}>}
 *   one entry per kept attribute; the node field holds the element's xpath.
 */
function getAllAttributes(node) {
  const elements = [node];
  for (const descendant of node.querySelectorAll("*")) {
    elements.push(descendant);
  }
  elements.reverse();

  const collected = [];
  for (const element of elements) {
    for (const attr of element.getAttributeNames()) {
      const entry = {
        node: getElementXPath(element),
        attr,
        value: element.getAttribute(attr) || "",
        tag: element.tagName.toLowerCase(),
      };
      if (entry.value.length > 10) {
        collected.push(entry);
      }
    }
  }
  return collected;
}
773
/**
 * Looks for stringToMatch inside the href property of node.
 *
 * Two attempts are made: first with stringToMatch as given, then with its
 * decodeURI form. Each attempt uses the same needle for both the containment
 * check and the exactness check, so the reported match_mode always describes
 * the needle that actually matched.
 *
 * @param {Element} node - element whose href property is inspected.
 * @param {string} stringToMatch - string to look for.
 * @returns {object|undefined} a match record, or undefined when the href is
 *   missing or not a string, when stringToMatch cannot be URI-decoded, or
 *   when neither attempt matches.
 */
function matchHref(node, stringToMatch) {
  const attributeValue = node["href"];
  if (!attributeValue || typeof attributeValue !== "string") {
    return;
  }

  // Returns a match record for needle, or undefined when it is not found.
  const matchNeedle = (needle) => {
    const [isPartOfStringResult, matchedValue] = isPartOfString(
      needle,
      attributeValue
    );
    if (!isPartOfStringResult) {
      return undefined;
    }
    const [isExact] = isMatchExact(needle, attributeValue);
    return {
      attribute: "href",
      fuzzy_distance: null,
      match_mode: isExact ? MatchMode.FULL : MatchMode.PARTIAL,
      match_source: MatchSource.ATTRIBUTE,
      matched_value: matchedValue,
      matched_source_value: attributeValue,
      tag: node.tagName.toLowerCase(),
      xpath: getElementXPath(node),
    };
  };

  const directMatch = matchNeedle(stringToMatch);
  if (directMatch) {
    return directMatch;
  }

  let decodedStringToMatch;
  try {
    decodedStringToMatch = decodeURI(stringToMatch);
  } catch (e) {
    // decodeURI throws on malformed percent sequences; treat as "no match".
    console.log("failed to decode stringToMatch", stringToMatch);
    return;
  }
  // Decoding changed nothing, so a second attempt would repeat the first.
  if (decodedStringToMatch === stringToMatch) {
    return;
  }
  return matchNeedle(decodedStringToMatch);
}
820
+
821
+ function convertElementToMarkdown(element) {
822
+ const mdCharsMatcher = /([\\\\[\\]()])/g;
823
+ function escapeMd(text) {
824
+ // Escapes markdown-sensitive characters within other markdown constructs.
825
+ return text.replace(mdCharsMatcher, "\\\\$1");
826
+ }
827
+ function listNumberingStart(attrs) {
828
+ const start = attrs.getNamedItem("start")?.value;
829
+ if (start) {
830
+ return parseInt(start, 10) - 1;
831
+ } else {
832
+ return 0;
833
+ }
834
+ }
835
+ // Define the characters that require escaping
836
+ const slashChars = "\\\\\`*_{}[]()#+-.!";
837
+ // Escape any special regex characters in slashChars
838
+ const escapedSlashChars = slashChars.replace(
839
+ /[-/\\\\^$*+?.()|[\\]{}]/g,
840
+ "\\\\$&"
841
+ );
842
+ // Create the regular expression
843
+ const mdBackslashMatcher = new RegExp(
844
+ \`\\\\\\\\(?=[\${escapedSlashChars}])\`,
845
+ "g"
846
+ );
847
+ const mdDotMatcher = new RegExp(\`^(\\\\s*\\\\d+)(\\\\.)(?=\\\\s)\`, "gm");
848
+ const mdPlusMatcher = new RegExp(\`^(\\\\s*)(\\\\+)(?=\\\\s)\`, "gm");
849
+ const mdDashMatcher = new RegExp(\`^(\\\\s*)(-)(?=\\\\s|-)\`, "gm");
850
+ function escapeMdSection(text) {
851
+ text = text.replace(mdBackslashMatcher, "\\\\\\\\");
852
+ text = text.replace(mdDotMatcher, "$1\\\\$2");
853
+ text = text.replace(mdPlusMatcher, "$1\\\\$2");
854
+ text = text.replace(mdDashMatcher, "$1\\\\$2");
855
+ return text;
856
+ }
857
+ function isFirstTbody(element) {
858
+ const previousSibling = element.previousSibling;
859
+ return (
860
+ element.nodeName === "TBODY" &&
861
+ (!previousSibling ||
862
+ (previousSibling.nodeName === "THEAD" &&
863
+ /^\\s*$/i.test(previousSibling.textContent ?? "")))
864
+ );
865
+ }
866
+ function isHeadingRow(tr) {
867
+ const parentNode = tr.parentNode;
868
+ return (
869
+ parentNode.nodeName === "THEAD" ||
870
+ (parentNode.firstChild === tr &&
871
+ (parentNode.nodeName === "TABLE" || isFirstTbody(parentNode)) &&
872
+ Array.from(tr.childNodes).every(function (n) {
873
+ return n.nodeName === "TH";
874
+ }))
875
+ );
876
+ }
877
+ class Html2Text {
878
+ p_p = 0;
879
+ abbrData; // last inner HTML (for abbr being defined)
880
+ pre = false;
881
+ code = false;
882
+ startPre = false;
883
+ blockquote = 0;
884
+ list = [];
885
+ start = true;
886
+ breakToggle = "";
887
+ space;
888
+ lastWasNewLine = false;
889
+ a = null;
890
+ outCount = 0;
891
+ baseurl;
892
+ abbrList = {};
893
+ outText = "";
894
+ outTextList = [];
895
+ abbr_title;
896
+ skipInternalLinks = true;
897
+ aStack = [];
898
+ maybeAutomaticLink;
899
+ lastWasList = false;
900
+ absoluteUrlMatcher = new RegExp("^[a-zA-Z+]+://");
901
+ emphasis_mark = "_";
902
+ strong_mark = "**";
903
+ break() {
904
+ if (this.p_p === 0) {
905
+ this.p_p = 1;
906
+ }
907
+ }
908
+ softBreak() {
909
+ this.break();
910
+ this.breakToggle = " ";
911
+ }
912
+ processOutput(data, pureData = 0, force = 0) {
913
+ if (this.abbrData !== undefined) {
914
+ this.abbrData += data;
915
+ }
916
+ if (pureData && !this.pre) {
917
+ data = data.replace(/\\s+/g, " ");
918
+ if (data && data[0] === " ") {
919
+ this.space = 1;
920
+ data = data.substring(1);
921
+ }
922
+ }
923
+ if (!data && force !== "end") return;
924
+ if (this.startPre) {
925
+ if (!data.startsWith("\\n")) {
926
+ data = "\\n" + data;
927
+ }
928
+ }
929
+ let newLineIndent = ">".repeat(this.blockquote ?? 0);
930
+ if (!(force === "end" && data && data[0] === ">") && this.blockquote) {
931
+ newLineIndent += " ";
932
+ }
933
+ if (this.pre) {
934
+ if (this.list.length === 0) {
935
+ newLineIndent += " ";
936
+ } else {
937
+ for (let i = 0; i < this.list.length + 1; i++) {
938
+ newLineIndent += " ";
939
+ }
940
+ }
941
+ data = data.replace(/\\n/g, \`\\n\${newLineIndent}\`);
942
+ }
943
+ if (this.startPre) {
944
+ this.startPre = false;
945
+ if (this.list.length > 0) {
946
+ data = data.trimStart();
947
+ }
948
+ }
949
+ if (this.start) {
950
+ this.space = 0;
951
+ this.p_p = 0;
952
+ this.start = false;
953
+ }
954
+ if (force === "end") {
955
+ this.p_p = 0;
956
+ this.out("\\n");
957
+ this.space = 0;
958
+ }
959
+ if (this.p_p) {
960
+ this.out((this.breakToggle + "\\n" + newLineIndent).repeat(this.p_p));
961
+ this.space = 0;
962
+ this.breakToggle = "";
963
+ }
964
+ if (this.space) {
965
+ if (!this.lastWasNewLine) {
966
+ this.out(" ");
967
+ }
968
+ this.space = 0;
969
+ }
970
+ if (this.a && force === "end") {
971
+ if (force === "end") {
972
+ this.out("\\n");
973
+ }
974
+ const newA = this.a.filter((link) => {
975
+ if (this.outCount > link.outcount) {
976
+ this.out(
977
+ " [" +
978
+ link.count +
979
+ "]: " +
980
+ new URL(link.href, this.baseurl).toString()
981
+ );
982
+ if (link.title) {
983
+ this.out(" (" + link.title + ")");
984
+ }
985
+ this.out("\\n");
986
+ return false;
987
+ }
988
+ return true;
989
+ });
990
+ if (this.a.length !== newA.length) {
991
+ this.out("\\n");
992
+ }
993
+ this.a = newA;
994
+ }
995
+ if (this.abbrList && force === "end") {
996
+ for (const [abbr, definition] of Object.entries(this.abbrList)) {
997
+ this.out("\\n *[" + abbr + "]: " + definition + "\\n");
998
+ }
999
+ }
1000
+ this.p_p = 0;
1001
+ this.out(data);
1002
+ this.outCount++;
1003
+ }
1004
+ out(string) {
1005
+ this.outTextList.push(string);
1006
+ if (string) {
1007
+ this.lastWasNewLine = string.charAt(string.length - 1) === "\\n";
1008
+ }
1009
+ }
1010
+ getResult() {
1011
+ this.processOutput("", 0, "end");
1012
+ this.outText = this.outTextList.join("");
1013
+ this.outText = this.outText.replace("&nbsp_place_holder;", " ");
1014
+ return this.outText;
1015
+ }
1016
+ getHeadingLevel(tag) {
1017
+ if (tag[0] === "h" && tag.length === 2) {
1018
+ try {
1019
+ const n = parseInt(tag[1]);
1020
+ if (!isNaN(n) && n >= 1 && n <= 9) {
1021
+ return n;
1022
+ }
1023
+ } catch (error) {
1024
+ return 0;
1025
+ }
1026
+ }
1027
+ return 0;
1028
+ }
1029
+ padding() {
1030
+ this.p_p = 2;
1031
+ }
1032
+ handleData(node) {
1033
+ if (this.maybeAutomaticLink) {
1034
+ const href = this.maybeAutomaticLink;
1035
+ if (
1036
+ href?.value === node.nodeValue &&
1037
+ this.absoluteUrlMatcher.test(href.value)
1038
+ ) {
1039
+ this.processOutput(\`<\${node.nodeValue}>\`);
1040
+ return;
1041
+ } else {
1042
+ this.processOutput("[");
1043
+ this.maybeAutomaticLink = null;
1044
+ }
1045
+ }
1046
+ if (!this.code && !this.pre && node.nodeValue) {
1047
+ const data = escapeMdSection(node.nodeValue);
1048
+ this.processOutput(data, 1);
1049
+ return;
1050
+ }
1051
+ this.processOutput(node.textContent || "", 1);
1052
+ }
1053
+ handleTag(node) {
1054
+ const tag = node.nodeName.toLowerCase();
1055
+ if (["head", "style", "script"].includes(tag)) {
1056
+ return;
1057
+ }
1058
+ if (this.getHeadingLevel(tag)) {
1059
+ this.padding();
1060
+ this.processOutput("#".repeat(this.getHeadingLevel(tag)) + " ");
1061
+ }
1062
+ if (tag == "br") this.processOutput(" \\n");
1063
+ if (tag == "hr") {
1064
+ this.padding();
1065
+ this.processOutput("---");
1066
+ this.padding();
1067
+ }
1068
+ if (tag == "blockquote") {
1069
+ this.padding();
1070
+ this.processOutput("> ", 0, 1);
1071
+ }
1072
+ }
1073
+ handleTagPrefix(node) {
1074
+ const nodeName = node.nodeName.toLowerCase();
1075
+ let attrs =
1076
+ node.nodeType === node.ELEMENT_NODE ? node.attributes : null;
1077
+ if (["table"].includes(nodeName)) {
1078
+ this.padding();
1079
+ }
1080
+ if (nodeName == "td" || nodeName == "th") {
1081
+ const index = Array.from(node.parentNode?.children ?? []).indexOf(
1082
+ node
1083
+ );
1084
+ let prefix = " ";
1085
+ if (index === 0) prefix = "| ";
1086
+ this.processOutput(prefix);
1087
+ // this.break();
1088
+ }
1089
+ if (["div", "p"].includes(nodeName)) {
1090
+ this.padding();
1091
+ }
1092
+ if (nodeName === "blockquote") {
1093
+ this.blockquote += 1;
1094
+ }
1095
+ if (nodeName === "pre") {
1096
+ this.pre = true;
1097
+ this.startPre = true;
1098
+ this.padding();
1099
+ }
1100
+ if (["code", "tt"].includes(nodeName)) {
1101
+ this.processOutput("\`");
1102
+ }
1103
+ if (["em", "i", "u"].includes(nodeName)) {
1104
+ this.processOutput(this.emphasis_mark);
1105
+ }
1106
+ if (["strong", "b"].includes(nodeName)) {
1107
+ this.processOutput(this.strong_mark);
1108
+ }
1109
+ if (["del", "strike", "s"].includes(nodeName)) {
1110
+ this.processOutput("<" + nodeName + ">");
1111
+ }
1112
+ if (nodeName === "abbr") {
1113
+ this.abbr_title = null;
1114
+ this.abbrData = "";
1115
+ const title = attrs && attrs.getNamedItem("title");
1116
+ if (attrs && title) {
1117
+ this.abbr_title = title.value;
1118
+ }
1119
+ }
1120
+ if (nodeName === "dl") {
1121
+ this.padding();
1122
+ }
1123
+ if (nodeName === "dd") {
1124
+ this.processOutput(" ");
1125
+ }
1126
+ if (nodeName == "a") {
1127
+ const href = attrs ? attrs.getNamedItem("href") : null;
1128
+ if (href && !(this.skipInternalLinks && href.value.startsWith("#"))) {
1129
+ this.aStack.push(attrs);
1130
+ this.maybeAutomaticLink = href;
1131
+ } else {
1132
+ this.aStack.push(null);
1133
+ }
1134
+ }
1135
+ if (nodeName === "img") {
1136
+ const src = attrs ? attrs.getNamedItem("src") : null;
1137
+ if (src) {
1138
+ node.setAttribute("href", src.value);
1139
+ attrs = node.attributes;
1140
+ const alt = attrs.getNamedItem("alt")?.value;
1141
+ this.processOutput("![" + escapeMd(alt ?? "") + "]");
1142
+ this.processOutput(
1143
+ "(" + escapeMd(attrs.getNamedItem("href")?.value ?? "") + ")"
1144
+ );
1145
+ }
1146
+ }
1147
+ if (["ul", "ol"].includes(nodeName)) {
1148
+ const listStyle = nodeName;
1149
+ const numberingStart = listNumberingStart(node.attributes);
1150
+ this.list.push({ name: listStyle, num: numberingStart });
1151
+ this.lastWasList = true;
1152
+ } else {
1153
+ this.lastWasList = false;
1154
+ }
1155
+ if (nodeName === "li") {
1156
+ let li;
1157
+ this.break();
1158
+ if (this.list.length > 0) {
1159
+ li = this.list[this.list.length - 1];
1160
+ } else {
1161
+ li = { name: "ul", num: 0 };
1162
+ }
1163
+ const nestCount = this.list.length;
1164
+ this.processOutput(" ".repeat(nestCount));
1165
+ if (li["name"] == "ul") this.processOutput("*" + " ");
1166
+ else if (li["name"] == "ol") {
1167
+ li["num"] += 1;
1168
+ this.processOutput(li["num"] + ". ");
1169
+ }
1170
+ this.start = true;
1171
+ }
1172
+ }
1173
+ handleTagSuffix(node) {
1174
+ const nodeName = node.nodeName.toLowerCase();
1175
+ if (nodeName === "blockquote") {
1176
+ this.blockquote -= 1;
1177
+ }
1178
+ if (nodeName == "td" || nodeName == "th") {
1179
+ this.processOutput(" |");
1180
+ }
1181
+ if (nodeName == "tr") {
1182
+ const cell = (content, node) => {
1183
+ const index = Array.from(node.parentNode.childNodes).indexOf(node);
1184
+ let prefix = " ";
1185
+ if (index === 0) prefix = "| ";
1186
+ return prefix + content + " |";
1187
+ };
1188
+ let borderCells = "";
1189
+ const alignMap = { left: ":--", right: "--:", center: ":-:" };
1190
+ if (isHeadingRow(node)) {
1191
+ for (let i = 0; i < node.children.length; i++) {
1192
+ let border = "---";
1193
+ const align = (
1194
+ node.children[i].getAttribute("align") || ""
1195
+ ).toLowerCase();
1196
+ if (align) border = alignMap[align] || border;
1197
+ borderCells += cell(border, node.childNodes[i]);
1198
+ }
1199
+ }
1200
+ this.processOutput(borderCells ? "\\n" + borderCells + "\\n" : "\\n");
1201
+ }
1202
+ if (nodeName === "pre") {
1203
+ this.pre = false;
1204
+ this.padding();
1205
+ }
1206
+ if (["code", "tt"].includes(nodeName)) {
1207
+ this.processOutput("\`");
1208
+ }
1209
+ if (["em", "i", "u"].includes(nodeName)) {
1210
+ this.processOutput(this.emphasis_mark);
1211
+ }
1212
+ if (["strong", "b"].includes(nodeName)) {
1213
+ this.processOutput(this.strong_mark);
1214
+ }
1215
+ if (["div", "p"].includes(nodeName)) {
1216
+ this.padding();
1217
+ }
1218
+ if (["del", "strike", "s"].includes(nodeName)) {
1219
+ this.processOutput("</" + nodeName + ">");
1220
+ }
1221
+ if (nodeName === "abbr") {
1222
+ if (this.abbr_title && this.abbrData) {
1223
+ this.abbrList[this.abbrData] = this.abbr_title;
1224
+ this.abbr_title = null;
1225
+ }
1226
+ this.abbrData = "";
1227
+ }
1228
+ if (nodeName === "dt") {
1229
+ this.break();
1230
+ }
1231
+ if (nodeName === "dd") {
1232
+ this.break();
1233
+ }
1234
+ if (nodeName === "a") {
1235
+ if (this.aStack.length > 0) {
1236
+ const a = this.aStack.pop();
1237
+ if (this.maybeAutomaticLink) {
1238
+ this.maybeAutomaticLink = null;
1239
+ } else if (a) {
1240
+ this.processOutput(
1241
+ \`](\${escapeMd(a.getNamedItem("href")?.value || "")})\`
1242
+ );
1243
+ }
1244
+ }
1245
+ }
1246
+ if (["ul", "ol"].includes(nodeName)) {
1247
+ if (this.list.length > 0) this.list.pop();
1248
+ this.lastWasList = true;
1249
+ } else {
1250
+ this.lastWasList = false;
1251
+ }
1252
+ if (nodeName === "li") {
1253
+ this.break();
1254
+ }
1255
+ }
1256
+ previousIndex(attrs) {
1257
+ // Returns the index of a certain set of attributes (of a link) in the
1258
+ // this.a list.
1259
+ // If the set of attributes is not found, returns null.
1260
+ const href = attrs.getNamedItem("href");
1261
+ if (!attrs.getNamedItem("href")) return null;
1262
+ let itemIndex = -1;
1263
+ for (const a of this.a ?? []) {
1264
+ itemIndex += 1;
1265
+ let match = false;
1266
+ if (a.getNamedItem("href") === href) {
1267
+ if (a.getNamedItem("title") || attrs.getNamedItem("title")) {
1268
+ if (
1269
+ a.getNamedItem("title") &&
1270
+ attrs.getNamedItem("title") &&
1271
+ a.getNamedItem("title") === attrs.getNamedItem("title")
1272
+ ) {
1273
+ match = true;
1274
+ }
1275
+ } else {
1276
+ match = true;
1277
+ }
1278
+ }
1279
+ if (match) return itemIndex;
1280
+ }
1281
+ return null;
1282
+ }
1283
+ handle(htmlElement) {
1284
+ // jsdom failed to parse hilton page due to invalid stylesheet
1285
+ // Nodes to be removed
1286
+ const filteredNodes = ["style", "script", "noscript"];
1287
+ for (const node of filteredNodes) {
1288
+ const nodeSelectors = htmlElement.querySelectorAll(node);
1289
+ nodeSelectors.forEach((nodeSelector) => {
1290
+ if (nodeSelector && nodeSelector.parentNode) {
1291
+ nodeSelector.parentNode.removeChild(nodeSelector);
1292
+ }
1293
+ });
1294
+ }
1295
+ // Get the cleaned-up HTML content
1296
+ const htmlContent = htmlElement.outerHTML;
1297
+ const parser = new DOMParser();
1298
+ const doc = parser.parseFromString(htmlContent, "text/html");
1299
+ const traverseDOM = (node) => {
1300
+ const tag = node.nodeName.toLowerCase();
1301
+ if (node.nodeType === node.TEXT_NODE) {
1302
+ const element = node;
1303
+ this.handleData(element);
1304
+ return;
1305
+ }
1306
+ if (node.nodeType === node.ELEMENT_NODE) {
1307
+ const element = node;
1308
+ this.handleTag(element);
1309
+ }
1310
+ if (!["head", "style", "script"].includes(tag)) {
1311
+ this.handleTagPrefix(node);
1312
+ node.childNodes.forEach((child) => traverseDOM(child));
1313
+ this.handleTagSuffix(node);
1314
+ }
1315
+ };
1316
+ traverseDOM(doc.documentElement);
1317
+ return this.getResult();
1318
+ }
1319
+ }
1320
+ const converter = new Html2Text();
1321
+ const result = converter.handle(element);
1322
+ return result;
1323
+ }
1324
+
1325
// Module-namespace objects, in the shape of CommonJS interop output.
var node = {};

var htmlToMarkdownAST$1 = {};

var ElementNode = {};

Object.defineProperty(ElementNode, "__esModule", { value: true });
ElementNode._Node = undefined;
// this is by value copy of the global Node
ElementNode._Node = {
  // --- nodeType values ---
  /** node is an element. */
  ELEMENT_NODE: 1,
  ATTRIBUTE_NODE: 2,
  /** node is a Text node. */
  TEXT_NODE: 3,
  /** node is a CDATASection node. */
  CDATA_SECTION_NODE: 4,
  ENTITY_REFERENCE_NODE: 5,
  ENTITY_NODE: 6,
  /** node is a ProcessingInstruction node. */
  PROCESSING_INSTRUCTION_NODE: 7,
  /** node is a Comment node. */
  COMMENT_NODE: 8,
  /** node is a document. */
  DOCUMENT_NODE: 9,
  /** node is a doctype. */
  DOCUMENT_TYPE_NODE: 10,
  /** node is a DocumentFragment node. */
  DOCUMENT_FRAGMENT_NODE: 11,
  NOTATION_NODE: 12,
  // --- document-position bit flags ---
  /** Set when node and other are not in the same tree. */
  DOCUMENT_POSITION_DISCONNECTED: 1,
  /** Set when other is preceding node. */
  DOCUMENT_POSITION_PRECEDING: 2,
  /** Set when other is following node. */
  DOCUMENT_POSITION_FOLLOWING: 4,
  /** Set when other is an ancestor of node. */
  DOCUMENT_POSITION_CONTAINS: 8,
  /** Set when other is a descendant of node. */
  DOCUMENT_POSITION_CONTAINED_BY: 16,
  DOCUMENT_POSITION_IMPLEMENTATION_SPECIFIC: 32,
};

Object.defineProperty(htmlToMarkdownAST$1, "__esModule", { value: true });
// htmlToMarkdownAST is a function declaration that appears after this point
// in the file; function declarations are hoisted, so it is usable here.
htmlToMarkdownAST$1.htmlToMarkdownAST = htmlToMarkdownAST;
const ElementNode_1$1 = ElementNode;
1371
+ function htmlToMarkdownAST(element, options, indentLevel = 0) {
1372
+ let result = [];
1373
+ const debugLog = (message) => {
1374
+ if (options?.debug) {
1375
+ console.log(message);
1376
+ }
1377
+ };
1378
+ element.childNodes.forEach((childElement) => {
1379
+ const overriddenElementProcessing = options?.overrideElementProcessing?.(
1380
+ childElement,
1381
+ options,
1382
+ indentLevel
1383
+ );
1384
+ if (overriddenElementProcessing) {
1385
+ debugLog(\`Element Processing Overridden: '\${childElement.nodeType}'\`);
1386
+ result.push(...overriddenElementProcessing);
1387
+ } else if (childElement.nodeType === ElementNode_1$1._Node.TEXT_NODE) {
1388
+ const textContent = escapeMarkdownCharacters(
1389
+ childElement.textContent?.trim() ?? ""
1390
+ );
1391
+ if (textContent && !!childElement.textContent) {
1392
+ debugLog(\`Text Node: '\${textContent}'\`);
1393
+ // preserve whitespaces when text childElement is not empty
1394
+ result.push({
1395
+ type: "text",
1396
+ content: childElement.textContent?.trim(),
1397
+ });
1398
+ }
1399
+ } else if (childElement.nodeType === ElementNode_1$1._Node.ELEMENT_NODE) {
1400
+ const elem = childElement;
1401
+ if (/^h[1-6]$/i.test(elem.tagName)) {
1402
+ const level = parseInt(elem.tagName.substring(1));
1403
+ const content = escapeMarkdownCharacters(
1404
+ elem.textContent || ""
1405
+ ).trim();
1406
+ if (content) {
1407
+ debugLog(\`Heading \${level}: '\${elem.textContent}'\`);
1408
+ result.push({ type: "heading", level, content });
1409
+ }
1410
+ } else if (elem.tagName.toLowerCase() === "p") {
1411
+ debugLog("Paragraph");
1412
+ result.push(...htmlToMarkdownAST(elem, options));
1413
+ // Add a new line after the paragraph
1414
+ result.push({ type: "text", content: "\\n\\n" });
1415
+ } else if (elem.tagName.toLowerCase() === "a") {
1416
+ debugLog(
1417
+ \`Link: '\${elem.getAttribute("href")}' with text '\${
1418
+ elem.textContent
1419
+ }'\`
1420
+ );
1421
+ // Check if the href is a data URL for an image
1422
+ if (
1423
+ typeof elem.getAttribute("href") === "string" &&
1424
+ elem.getAttribute("href").startsWith("data:image")
1425
+ ) {
1426
+ // If it's a data URL for an image, skip this link
1427
+ result.push({
1428
+ type: "link",
1429
+ href: "-",
1430
+ content: htmlToMarkdownAST(elem, options),
1431
+ });
1432
+ } else {
1433
+ // Process the link as usual
1434
+ let href = elem.getAttribute("href");
1435
+ if (typeof href === "string") {
1436
+ href =
1437
+ options?.websiteDomain && href.startsWith(options.websiteDomain)
1438
+ ? href.substring(options.websiteDomain.length)
1439
+ : href;
1440
+ } else {
1441
+ href = "#"; // Use a default value when href is not a string
1442
+ }
1443
+ // if all children are text,
1444
+ if (
1445
+ Array.from(elem.childNodes).every(
1446
+ (_) => _.nodeType === ElementNode_1$1._Node.TEXT_NODE
1447
+ )
1448
+ ) {
1449
+ result.push({
1450
+ type: "link",
1451
+ href: href,
1452
+ content: [
1453
+ { type: "text", content: elem.textContent?.trim() ?? "" },
1454
+ ],
1455
+ });
1456
+ } else {
1457
+ result.push({
1458
+ type: "link",
1459
+ href: href,
1460
+ content: htmlToMarkdownAST(elem, options),
1461
+ });
1462
+ }
1463
+ }
1464
+ } else if (elem.tagName.toLowerCase() === "img") {
1465
+ debugLog(\`Image: src='\${elem.src}', alt='\${elem.alt}'\`);
1466
+ if (elem.src?.startsWith("data:image")) {
1467
+ result.push({
1468
+ type: "image",
1469
+ src: "-",
1470
+ alt: escapeMarkdownCharacters(elem.alt),
1471
+ });
1472
+ } else {
1473
+ const src =
1474
+ options?.websiteDomain &&
1475
+ elem.src?.startsWith(options.websiteDomain)
1476
+ ? elem.src?.substring(options.websiteDomain.length)
1477
+ : elem.src;
1478
+ result.push({
1479
+ type: "image",
1480
+ src,
1481
+ alt: escapeMarkdownCharacters(elem.alt),
1482
+ });
1483
+ }
1484
+ } else if (elem.tagName.toLowerCase() === "video") {
1485
+ debugLog(
1486
+ \`Video: src='\${elem.src}', poster='\${elem.poster}', controls='\${elem.controls}'\`
1487
+ );
1488
+ result.push({
1489
+ type: "video",
1490
+ src: elem.src,
1491
+ poster: escapeMarkdownCharacters(elem.poster),
1492
+ controls: elem.controls,
1493
+ });
1494
+ } else if (
1495
+ elem.tagName.toLowerCase() === "ul" ||
1496
+ elem.tagName.toLowerCase() === "ol"
1497
+ ) {
1498
+ debugLog(
1499
+ \`\${
1500
+ elem.tagName.toLowerCase() === "ul" ? "Unordered" : "Ordered"
1501
+ } List\`
1502
+ );
1503
+ result.push({
1504
+ type: "list",
1505
+ ordered: elem.tagName.toLowerCase() === "ol",
1506
+ items: Array.from(elem.children).map((li) => ({
1507
+ type: "listItem",
1508
+ content: htmlToMarkdownAST(li, options, indentLevel + 1),
1509
+ })),
1510
+ });
1511
+ } else if (elem.tagName.toLowerCase() === "br") {
1512
+ debugLog("Line Break");
1513
+ result.push({ type: "text", content: "\\n" });
1514
+ } else if (elem.tagName.toLowerCase() === "table") {
1515
+ debugLog("Table");
1516
+ let colIds = [];
1517
+ if (options?.enableTableColumnTracking) {
1518
+ // Generate unique column IDs
1519
+ const headerCells = Array.from(elem.querySelectorAll("th, td"));
1520
+ headerCells.forEach((_, index) => {
1521
+ colIds.push(\`col-\${index}\`);
1522
+ });
1523
+ }
1524
+ const tableRows = Array.from(elem.querySelectorAll("tr"));
1525
+ const markdownTableRows = tableRows.map((row) => {
1526
+ let columnIndex = 0;
1527
+ const cells = Array.from(row.querySelectorAll("th, td")).map(
1528
+ (cell) => {
1529
+ const colspan = parseInt(
1530
+ cell.getAttribute("colspan") || "1",
1531
+ 10
1532
+ );
1533
+ const rowspan = parseInt(
1534
+ cell.getAttribute("rowspan") || "1",
1535
+ 10
1536
+ );
1537
+ const cellNode = {
1538
+ type: "tableCell",
1539
+ content:
1540
+ cell.nodeType === ElementNode_1$1._Node.TEXT_NODE
1541
+ ? escapeMarkdownCharacters(cell.textContent?.trim() ?? "")
1542
+ : htmlToMarkdownAST(cell, options, indentLevel + 1),
1543
+ colId: colIds[columnIndex],
1544
+ colspan: colspan > 1 ? colspan : undefined,
1545
+ rowspan: rowspan > 1 ? rowspan : undefined,
1546
+ };
1547
+ columnIndex += colspan;
1548
+ return cellNode;
1549
+ }
1550
+ );
1551
+ return { type: "tableRow", cells };
1552
+ });
1553
+ if (markdownTableRows.length > 0) {
1554
+ // Check if the first row contains header cells
1555
+ const hasHeaders = tableRows[0].querySelector("th") !== null;
1556
+ if (hasHeaders) {
1557
+ // Create a header separator row
1558
+ const headerSeparatorCells = Array.from(
1559
+ tableRows[0].querySelectorAll("th, td")
1560
+ ).map(() => ({
1561
+ type: "tableCell",
1562
+ content: "---",
1563
+ colId: undefined,
1564
+ colspan: undefined,
1565
+ rowspan: undefined,
1566
+ }));
1567
+ const headerSeparatorRow = {
1568
+ type: "tableRow",
1569
+ cells: headerSeparatorCells,
1570
+ };
1571
+ markdownTableRows.splice(1, 0, headerSeparatorRow);
1572
+ }
1573
+ }
1574
+ result.push({ type: "table", rows: markdownTableRows, colIds });
1575
+ } else if (
1576
+ elem.tagName.toLowerCase() === "head" &&
1577
+ !!options?.includeMetaData
1578
+ ) {
1579
+ const node = {
1580
+ type: "meta",
1581
+ content: {
1582
+ standard: {},
1583
+ openGraph: {},
1584
+ twitter: {},
1585
+ },
1586
+ };
1587
+ elem.querySelectorAll("title").forEach((titleElem) => {
1588
+ node.content.standard["title"] = escapeMarkdownCharacters(
1589
+ titleElem.text
1590
+ );
1591
+ });
1592
+ // Extract meta tags
1593
+ const metaTags = elem.querySelectorAll("meta");
1594
+ const nonSemanticTagNames = [
1595
+ "viewport",
1596
+ "referrer",
1597
+ "Content-Security-Policy",
1598
+ ];
1599
+ metaTags.forEach((metaTag) => {
1600
+ const name = metaTag.getAttribute("name");
1601
+ const property = metaTag.getAttribute("property");
1602
+ const content = metaTag.getAttribute("content");
1603
+ if (property && property.startsWith("og:") && content) {
1604
+ if (options.includeMetaData === "extended") {
1605
+ node.content.openGraph[property.substring(3)] = content;
1606
+ }
1607
+ } else if (name && name.startsWith("twitter:") && content) {
1608
+ if (options.includeMetaData === "extended") {
1609
+ node.content.twitter[name.substring(8)] = content;
1610
+ }
1611
+ } else if (name && !nonSemanticTagNames.includes(name) && content) {
1612
+ node.content.standard[name] = content;
1613
+ }
1614
+ });
1615
+ // Extract JSON-LD data
1616
+ if (options.includeMetaData === "extended") {
1617
+ const jsonLdData = [];
1618
+ const jsonLDScripts = elem.querySelectorAll(
1619
+ 'script[type="application/ld+json"]'
1620
+ );
1621
+ jsonLDScripts.forEach((script) => {
1622
+ try {
1623
+ const jsonContent = script.textContent;
1624
+ if (jsonContent) {
1625
+ const parsedData = JSON.parse(jsonContent);
1626
+ jsonLdData.push(parsedData);
1627
+ }
1628
+ } catch (error) {
1629
+ console.error("Failed to parse JSON-LD", error);
1630
+ }
1631
+ });
1632
+ node.content.jsonLd = jsonLdData;
1633
+ }
1634
+ result.push(node);
1635
+ } else {
1636
+ const content = escapeMarkdownCharacters(elem.textContent || "");
1637
+ switch (elem.tagName.toLowerCase()) {
1638
+ case "noscript":
1639
+ case "script":
1640
+ case "style":
1641
+ case "html":
1642
+ // blackhole..
1643
+ break;
1644
+ case "strong":
1645
+ case "b":
1646
+ if (content) {
1647
+ debugLog(\`Bold: '\${content}'\`);
1648
+ result.push({
1649
+ type: "bold",
1650
+ content: htmlToMarkdownAST(elem, options, indentLevel + 1),
1651
+ });
1652
+ }
1653
+ break;
1654
+ case "em":
1655
+ case "i":
1656
+ if (content) {
1657
+ debugLog(\`Italic: '\${content}'\`);
1658
+ result.push({
1659
+ type: "italic",
1660
+ content: htmlToMarkdownAST(elem, options, indentLevel + 1),
1661
+ });
1662
+ }
1663
+ break;
1664
+ case "s":
1665
+ case "strike":
1666
+ if (content) {
1667
+ debugLog(\`Strikethrough: '\${content}'\`);
1668
+ result.push({
1669
+ type: "strikethrough",
1670
+ content: htmlToMarkdownAST(elem, options, indentLevel + 1),
1671
+ });
1672
+ }
1673
+ break;
1674
+ case "code":
1675
+ if (content) {
1676
+ // Handling inline code differently
1677
+ const isCodeBlock =
1678
+ elem.parentNode &&
1679
+ elem.parentNode.nodeName.toLowerCase() === "pre";
1680
+ debugLog(
1681
+ \`\${isCodeBlock ? "Code Block" : "Inline Code"}: '\${content}'\`
1682
+ );
1683
+ const languageClass = elem.className
1684
+ ?.split(" ")
1685
+ .find((cls) => cls.startsWith("language-"));
1686
+ const language = languageClass
1687
+ ? languageClass.replace("language-", "")
1688
+ : "";
1689
+ result.push({
1690
+ type: "code",
1691
+ content: elem.textContent?.trim() ?? "",
1692
+ language,
1693
+ inline: !isCodeBlock,
1694
+ });
1695
+ }
1696
+ break;
1697
+ case "blockquote":
1698
+ debugLog(\`Blockquote\`);
1699
+ result.push({
1700
+ type: "blockquote",
1701
+ content: htmlToMarkdownAST(elem, options),
1702
+ });
1703
+ break;
1704
+ case "article":
1705
+ case "aside":
1706
+ case "details":
1707
+ case "figcaption":
1708
+ case "figure":
1709
+ case "footer":
1710
+ case "header":
1711
+ case "main":
1712
+ case "mark":
1713
+ case "nav":
1714
+ case "section":
1715
+ case "summary":
1716
+ case "time":
1717
+ debugLog(\`Semantic HTML Element: '\${elem.tagName}'\`);
1718
+ result.push({
1719
+ type: "semanticHtml",
1720
+ htmlType: elem.tagName.toLowerCase(),
1721
+ content: htmlToMarkdownAST(elem, options),
1722
+ });
1723
+ break;
1724
+ default:
1725
+ const unhandledElementProcessing =
1726
+ options?.processUnhandledElement?.(elem, options, indentLevel);
1727
+ if (unhandledElementProcessing) {
1728
+ debugLog(\`Processing Unhandled Element: '\${elem.tagName}'\`);
1729
+ result.push(...unhandledElementProcessing);
1730
+ } else {
1731
+ debugLog(\`Generic HTMLElement: '\${elem.tagName}'\`);
1732
+ result.push(
1733
+ ...htmlToMarkdownAST(elem, options, indentLevel + 1)
1734
+ );
1735
+ }
1736
+ break;
1737
+ }
1738
+ }
1739
+ }
1740
+ });
1741
+ return result;
1742
+ }
1743
/**
 * Escapes text so it can be placed in Markdown output without being read as markup.
 *
 * @param text The raw text to escape.
 * @param isInlineCode When true, the text is returned untouched.
 * @returns The escaped text, or the input unchanged for inline code and for
 *          blank, empty, or missing text.
 */
function escapeMarkdownCharacters(text, isInlineCode = false) {
  // Inline code is emitted verbatim.
  if (isInlineCode) {
    return text;
  }
  // Nothing to escape in blank, empty, or missing text; hand it back as received.
  if (!text?.trim()) {
    return text;
  }
  const htmlEntities = { "&": "&amp;", "<": "&lt;", ">": "&gt;" };
  // A single pass is enough: the entity text produced here is never rescanned.
  const withEntities = text.replace(/[&<>]/g, (ch) => htmlEntities[ch]);
  // Put a backslash in front of every character that has special meaning in Markdown.
  return withEntities.replace(/[\\`*_{}[\]#+!|]/g, "\\$&");
}
1757
+
1758
+ var markdownASTToString = {};
1759
+
1760
+ var hasRequiredMarkdownASTToString;
1761
+
1762
// Lazily initialises the markdownASTToString module object and returns it.
// The hasRequiredMarkdownASTToString flag makes the body run once; later calls return the same object.
+ function requireMarkdownASTToString() {
1763
+ if (hasRequiredMarkdownASTToString) return markdownASTToString;
1764
+ hasRequiredMarkdownASTToString = 1;
1765
+ Object.defineProperty(markdownASTToString, "__esModule", { value: true });
1766
+ markdownASTToString.markdownASTToString = markdownASTToString$1;
// index_1 is the module object from requireNode(); only findInMarkdownAST is read from it below.
1767
+ const index_1 = requireNode();
// Entry point: renders the AST to one string, meta-data block first, then the content.
1768
+ function markdownASTToString$1(nodes, options, indentLevel = 0) {
1769
+ let markdownString = "";
1770
+ markdownString += markdownMetaASTToString(nodes, options, indentLevel);
1771
+ markdownString += markdownContentASTToString(nodes, options, indentLevel);
1772
+ return markdownString;
1773
+ }
// Builds the meta-data block, delimited by "---" lines, when options.includeMetaData is truthy.
// Returns an empty string otherwise. indentLevel is accepted but not used in this function.
1774
+ function markdownMetaASTToString(nodes, options, indentLevel = 0) {
1775
+ let markdownString = "";
1776
+ if (options?.includeMetaData) {
1777
+ // include meta-data
1778
+ markdownString += "---\\n";
// Look up the first node of type "meta" anywhere findInMarkdownAST searches.
1779
+ const node = (0, index_1.findInMarkdownAST)(
1780
+ nodes,
1781
+ (_) => _.type === "meta"
1782
+ );
1783
+ if (node?.type === "meta") {
1784
+ if (node.content.standard) {
// Standard entries are written as key: "value" lines; values are inserted as-is.
// NOTE(review): a double quote inside a value is not escaped here; confirm consumers tolerate that.
1785
+ Object.keys(node.content.standard).forEach((key) => {
1786
+ markdownString += \`\${key}: "\${node.content.standard[key]}"\\n\`;
1787
+ });
1788
+ }
// Open Graph, Twitter and JSON-LD sections are only written in "extended" mode.
1789
+ if (options.includeMetaData === "extended") {
1790
+ if (node.content.openGraph) {
1791
+ if (Object.keys(node.content.openGraph).length > 0) {
1792
+ markdownString += "openGraph:\\n";
1793
+ for (const [key, value] of Object.entries(
1794
+ node.content.openGraph
1795
+ )) {
1796
+ markdownString += \` \${key}: "\${value}"\\n\`;
1797
+ }
1798
+ }
1799
+ }
1800
+ if (node.content.twitter) {
1801
+ if (Object.keys(node.content.twitter).length > 0) {
1802
+ markdownString += "twitter:\\n";
1803
+ for (const [key, value] of Object.entries(
1804
+ node.content.twitter
1805
+ )) {
1806
+ markdownString += \` \${key}: "\${value}"\\n\`;
1807
+ }
1808
+ }
1809
+ }
1810
+ if (node.content.jsonLd && node.content.jsonLd.length > 0) {
1811
+ markdownString += "schema:\\n";
// Each JSON-LD entry is listed under its @type; @context is dropped and the remaining keys are JSON-encoded.
// NOTE(review): the destructuring below throws if an entry is null; entries come from JSON.parse without validation, so confirm whether a guard is needed.
1812
+ node.content.jsonLd.forEach((item) => {
1813
+ const {
1814
+ "@context": jldContext,
1815
+ "@type": jldType,
1816
+ ...semanticData
1817
+ } = item;
1818
+ markdownString += \` \${jldType ?? "(unknown type)"}:\\n\`;
1819
+ Object.keys(semanticData).forEach((key) => {
1820
+ markdownString += \` \${key}: \${JSON.stringify(
1821
+ semanticData[key]
1822
+ )}\\n\`;
1823
+ });
1824
+ });
1825
+ }
1826
+ }
1827
+ }
// The closing delimiter is written even when no meta node was found.
1828
+ markdownString += "---\\n\\n";
1829
+ }
1830
+ return markdownString;
1831
+ }
// Renders the content nodes in order by appending to one string.
// Several branches inspect the tail of that string, so the order of the appends matters.
1832
+ function markdownContentASTToString(nodes, options, indentLevel = 0) {
1833
+ let markdownString = "";
1834
+ nodes.forEach((node) => {
1835
+ const indent = " ".repeat(indentLevel * 2); // Adjust the multiplier for different indent sizes
// A truthy result from options.overrideNodeRenderer replaces the built-in rendering of this node.
1836
+ const nodeRenderingOverride = options?.overrideNodeRenderer?.(
1837
+ node,
1838
+ options,
1839
+ indentLevel
1840
+ );
1841
+ if (nodeRenderingOverride) {
1842
+ markdownString += nodeRenderingOverride;
1843
+ } else {
1844
+ switch (node.type) {
1845
+ case "text":
1846
+ case "bold":
1847
+ case "italic":
1848
+ case "strikethrough":
1849
+ case "link":
1850
+ let content = node.content; // might be a nodes array but we take care of that below
1851
+ if (Array.isArray(node.content)) {
1852
+ content = markdownContentASTToString(
1853
+ node.content,
1854
+ options,
1855
+ indentLevel
1856
+ );
1857
+ }
// Separator rule: add one space before this inline piece only when output already exists,
// the piece is not a single punctuation character, and neither side of the join is whitespace.
1858
+ const isMarkdownStringNotEmpty = markdownString.length > 0;
1859
+ const isFirstCharOfContentWhitespace = /\\s/.test(
1860
+ content.slice(0, 1)
1861
+ );
1862
+ const isLastCharOfMarkdownWhitespace = /\\s/.test(
1863
+ markdownString.slice(-1)
1864
+ );
1865
+ const isContentPunctuation =
1866
+ content.length === 1 && /^[.,!?;:]/.test(content);
1867
+ if (
1868
+ isMarkdownStringNotEmpty &&
1869
+ !isContentPunctuation &&
1870
+ !isFirstCharOfContentWhitespace &&
1871
+ !isLastCharOfMarkdownWhitespace
1872
+ ) {
1873
+ markdownString += " ";
1874
+ }
// Only plain text is prefixed with the indent; the styled variants are appended without it.
1875
+ if (node.type === "text") {
1876
+ markdownString += \`\${indent}\${content}\`;
1877
+ } else {
1878
+ if (node.type === "bold") {
1879
+ markdownString += \`**\${content}**\`;
1880
+ } else if (node.type === "italic") {
1881
+ markdownString += \`*\${content}*\`;
1882
+ } else if (node.type === "strikethrough") {
1883
+ markdownString += \`~~\${content}~~\`;
1884
+ } else if (node.type === "link") {
1885
+ // check if the link contains only text
1886
+ if (
1887
+ node.content.length === 1 &&
1888
+ node.content[0].type === "text"
1889
+ ) {
1890
+ // use native markdown syntax for text-only links
1891
+ markdownString += \`[\${content}](\${encodeURI(node.href)})\`;
1892
+ } else {
1893
+ // Use HTML <a> tag for links with rich content
1894
+ markdownString += \`<a href="\${node.href}">\${content}</a>\`;
1895
+ }
1896
+ }
1897
+ }
1898
+ break;
1899
+ case "heading":
// A heading always starts on its own line and is followed by a blank line.
// NOTE(review): node.content is interpolated directly, so this presumably expects a string; confirm against the AST builder.
1900
+ const isEndsWithNewLine = markdownString.slice(-1) === "\\n";
1901
+ if (!isEndsWithNewLine) {
1902
+ markdownString += "\\n";
1903
+ }
1904
+ markdownString += \`\${"#".repeat(node.level)} \${node.content}\\n\\n\`;
1905
+ break;
1906
+ case "image":
// As written, the image is skipped only when alt is non-blank and src is blank.
// NOTE(review): an image with neither alt nor src is still emitted; confirm the intended condition.
1907
+ if (!node.alt?.trim() || !!node.src?.trim()) {
1908
+ markdownString += \`![\${node.alt || ""}](\${node.src})\`;
1909
+ }
1910
+ break;
1911
+ case "list":
// Items are rendered one level deeper and trimmed; ordered lists number items from 1 by position.
// An item whose rendered content is empty adds no bullet, but the newline check still runs for it.
1912
+ node.items.forEach((item, i) => {
1913
+ const listItemPrefix = node.ordered ? \`\${i + 1}.\` : "-";
1914
+ const contents = markdownContentASTToString(
1915
+ item.content,
1916
+ options,
1917
+ indentLevel + 1
1918
+ ).trim();
1919
+ if (markdownString.slice(-1) !== "\\n") {
1920
+ markdownString += "\\n";
1921
+ }
1922
+ if (contents) {
1923
+ markdownString += \`\${indent}\${listItemPrefix} \${contents}\\n\`;
1924
+ }
1925
+ });
1926
+ markdownString += "\\n";
1927
+ break;
1928
+ case "video":
// Videos are written with image syntax, followed by optional poster and controls lines.
1929
+ markdownString += \`\\n![Video](\${node.src})\\n\`;
1930
+ if (node.poster) {
1931
+ markdownString += \`![Poster](\${node.poster})\\n\`;
1932
+ }
1933
+ if (node.controls) {
1934
+ markdownString += \`Controls: \${node.controls}\\n\`;
1935
+ }
1936
+ markdownString += "\\n";
1937
+ break;
1938
+ case "table":
// The widest row, counting each cell's colspan, decides how many columns every row is padded to.
1939
+ const maxColumns = Math.max(
1940
+ ...node.rows.map((row) =>
1941
+ row.cells.reduce((sum, cell) => sum + (cell.colspan || 1), 0)
1942
+ )
1943
+ );
1944
+ node.rows.forEach((row) => {
1945
+ let currentColumn = 0;
1946
+ row.cells.forEach((cell) => {
// Cell content is either a ready string or a nested AST that is rendered and trimmed.
1947
+ let cellContent =
1948
+ typeof cell.content === "string"
1949
+ ? cell.content
1950
+ : markdownContentASTToString(
1951
+ cell.content,
1952
+ options,
1953
+ indentLevel + 1
1954
+ ).trim();
// Column id and spans are carried along as HTML comments appended to the cell text.
1955
+ if (cell.colId) {
1956
+ cellContent += \` <!-- \${cell.colId} -->\`;
1957
+ }
1958
+ if (cell.colspan && cell.colspan > 1) {
1959
+ cellContent += \` <!-- colspan: \${cell.colspan} -->\`;
1960
+ }
1961
+ if (cell.rowspan && cell.rowspan > 1) {
1962
+ cellContent += \` <!-- rowspan: \${cell.rowspan} -->\`;
1963
+ }
1964
+ markdownString += \`| \${cellContent} \`;
1965
+ currentColumn += cell.colspan || 1;
1966
+ // Add empty cells for colspan
1967
+ for (let i = 1; i < (cell.colspan || 1); i++) {
1968
+ markdownString += "| ";
1969
+ }
1970
+ });
1971
+ // Fill remaining columns with empty cells
1972
+ while (currentColumn < maxColumns) {
1973
+ markdownString += "| ";
1974
+ currentColumn++;
1975
+ }
1976
+ markdownString += "|\\n";
1977
+ });
1978
+ markdownString += "\\n";
1979
+ break;
1980
+ case "code":
1981
+ if (node.inline) {
// Inline code gets a leading space unless the output already ends in whitespace.
// That includes the case where nothing has been written yet.
1982
+ const isLsatWhitespace = /\\s/.test(markdownString.slice(-1));
1983
+ if (!isLsatWhitespace) {
1984
+ markdownString += " ";
1985
+ }
1986
+ markdownString += \`\\\`\${node.content}\\\`\`;
1987
+ } else {
1988
+ // For code blocks, we do not escape characters and preserve formatting
1989
+ markdownString += "\\n\`\`\`" + (node.language ?? "") + "\\n";
1990
+ markdownString += \`\${node.content}\\n\`;
1991
+ markdownString += "\`\`\`\\n\\n";
1992
+ }
1993
+ break;
1994
+ case "blockquote":
// The quote marker is written once, in front of the whole rendered content; indentLevel is not forwarded.
// NOTE(review): content spanning several lines only has its first line prefixed; confirm this is acceptable.
1995
+ markdownString += \`> \${markdownContentASTToString(
1996
+ node.content,
1997
+ options
1998
+ ).trim()}\\n\\n\`;
1999
+ break;
2000
+ case "meta":
2001
+ // already handled
2002
+ break;
2003
+ case "semanticHtml":
// Semantic containers are rendered from indent level 0 (indentLevel is not forwarded).
// article: content only. section: content between "---" rules. All others: content between <-tag-> and </-tag-> marker lines.
2004
+ switch (node.htmlType) {
2005
+ case "article":
2006
+ markdownString +=
2007
+ "\\n\\n" + markdownContentASTToString(node.content, options);
2008
+ break;
2009
+ case "summary":
2010
+ case "time":
2011
+ case "aside":
2012
+ case "nav":
2013
+ case "figcaption":
2014
+ case "main":
2015
+ case "mark":
2016
+ case "header":
2017
+ case "footer":
2018
+ case "details":
2019
+ case "figure":
2020
+ markdownString +=
2021
+ \`\\n\\n<-\${node.htmlType}->\\n\` +
2022
+ markdownContentASTToString(node.content, options) +
2023
+ \`\\n\\n</-\${node.htmlType}->\\n\`;
2024
+ break;
2025
+ case "section":
2026
+ markdownString += "---\\n\\n";
2027
+ markdownString += markdownContentASTToString(
2028
+ node.content,
2029
+ options
2030
+ );
2031
+ markdownString += "\\n\\n";
2032
+ markdownString += "---\\n\\n";
2033
+ break;
2034
+ }
2035
+ break;
2036
+ case "custom":
// Custom nodes produce output only when options.renderCustomNode returns a truthy value.
2037
+ const customNodeRendering = options?.renderCustomNode?.(
2038
+ node,
2039
+ options,
2040
+ indentLevel
2041
+ );
2042
+ if (customNodeRendering) {
2043
+ markdownString += customNodeRendering;
2044
+ }
2045
+ break;
2046
+ }
2047
+ }
2048
+ });
2049
+ return markdownString;
2050
+ }
2051
+ return markdownASTToString;
2052
+ }
2053
+
2054
+ var domUtils = {};
2055
+
2056
+ Object.defineProperty(domUtils, "__esModule", { value: true });
2057
+ domUtils.findMainContent = findMainContent;
2058
+ domUtils.wrapMainContent = wrapMainContent;
2059
+ domUtils.isElementVisible = isElementVisible;
2060
+ domUtils.getVisibleText = getVisibleText;
2061
+ const ElementNode_1 = ElementNode;
2062
+ const debugMessage = (message) => {};
2063
/**
 * Locates the element that most likely holds a page's primary content.
 * @param document The Document object to search.
 * @returns The first <main> element when one exists. Without one, the document
 *          element is returned if the document has no body; otherwise the
 *          result of the heuristic search over the body.
 */
function findMainContent(document) {
  const explicitMain = document.querySelector("main");
  if (explicitMain) {
    return explicitMain;
  }
  return document.body
    ? detectMainContent(document.body)
    : document.documentElement;
}
2078
/**
 * Wraps the given element in a new <main> element, unless it already is one.
 * The wrapper is inserted where the element stood, the element is moved inside
 * it, and the wrapper receives the id "detected-main-content".
 * @param mainContentElement The element chosen as main content.
 * @param document The Document used to create the wrapper.
 */
function wrapMainContent(mainContentElement, document) {
  if (mainContentElement.tagName.toLowerCase() === "main") {
    return;
  }
  const wrapper = document.createElement("main");
  mainContentElement.before(wrapper);
  wrapper.appendChild(mainContentElement);
  wrapper.id = "detected-main-content";
}
2086
/**
 * Picks the best main-content candidate beneath rootElement.
 *
 * Candidates are the elements collected by collectCandidates with a score of at
 * least 20. The highest-scoring candidate wins; on a tie the one collected first
 * is kept. That is the same element the previous sort-based version returned:
 * its follow-up "independent candidate" loop could never replace the top entry
 * of a list already sorted by descending score, so it has been removed.
 * Each candidate is now scored once here instead of on every sort comparison.
 *
 * @param rootElement The element whose subtree is searched.
 * @returns The winning candidate, or rootElement when nothing reaches the minimum score.
 */
function detectMainContent(rootElement) {
  const candidates = [];
  const minScore = 20;
  collectCandidates(rootElement, candidates, minScore);
  if (candidates.length === 0) {
    return rootElement;
  }
  let bestCandidate = candidates[0];
  let bestScore = calculateScore(bestCandidate);
  for (let i = 1; i < candidates.length; i++) {
    const candidateScore = calculateScore(candidates[i]);
    // Strictly greater, so the earliest candidate keeps its place on a tie.
    if (candidateScore > bestScore) {
      bestCandidate = candidates[i];
      bestScore = candidateScore;
    }
  }
  debugMessage(
    `Final main content candidate: ${elementToString(bestCandidate)}`
  );
  return bestCandidate;
}
2122
/**
 * Formats an element as "TAG#id.class1.class2" for debug messages.
 * @param element The element to describe; may be missing.
 * @returns "No element" for a falsy argument, otherwise the formatted label,
 *          with "no-id" standing in for an empty id.
 */
function elementToString(element) {
  if (!element) {
    return "No element";
  }
  const tagPart = element.tagName;
  const idPart = element.id || "no-id";
  const classPart = Array.from(element.classList).join(".");
  return `${tagPart}#${idPart}.${classPart}`;
}
2130
/**
 * Walks the element tree depth-first and appends every element whose score
 * reaches minScore to the candidates array, parents before their children.
 * @param element The element to examine, together with its descendants.
 * @param candidates The array that receives qualifying elements (mutated).
 * @param minScore The lowest score that qualifies an element.
 */
function collectCandidates(element, candidates, minScore) {
  const score = calculateScore(element);
  if (score >= minScore) {
    candidates.push(element);
    debugMessage(
      `Candidate found: ${elementToString(element)}, score: ${score}`
    );
  }
  for (const child of Array.from(element.children)) {
    collectCandidates(child, candidates, minScore);
  }
}
2142
/**
 * Scores how strongly an element looks like the page's main content container.
 *
 * Points are added independently by each rule:
 *  - +10 per keyword (article, content, main-container, main, main-content)
 *    that is one of the element's classes or occurs inside its id;
 *  - +5 for an article, main or section tag;
 *  - +1 per descendant paragraph, capped at 5;
 *  - +1 per 200 characters of trimmed text once it exceeds 200, capped at 5;
 *  - +5 when less than 30% of the text sits inside links;
 *  - +10 for a data-main or data-content attribute;
 *  - +10 when the role attribute contains "main".
 *
 * @param element The element to score.
 * @returns The total score.
 */
function calculateScore(element) {
  let score = 0;
  const scoreLog = [];
  // Read the id from the attribute rather than the id property. On a form that
  // contains a control named "id", the property resolves to that control instead
  // of a string, and calling includes on it throws.
  const elementId = element.getAttribute("id") ?? "";
  // High impact attributes
  const highImpactAttributes = [
    "article",
    "content",
    "main-container",
    "main",
    "main-content",
  ];
  for (const attr of highImpactAttributes) {
    if (element.classList.contains(attr) || elementId.includes(attr)) {
      score += 10;
      scoreLog.push(
        `High impact attribute found: ${attr}, score increased by 10`
      );
    }
  }
  // High impact tags
  const highImpactTags = ["article", "main", "section"];
  if (highImpactTags.includes(element.tagName.toLowerCase())) {
    score += 5;
    scoreLog.push(
      `High impact tag found: ${element.tagName}, score increased by 5`
    );
  }
  // Paragraph count
  const paragraphCount = element.getElementsByTagName("p").length;
  const paragraphScore = Math.min(paragraphCount, 5);
  if (paragraphScore > 0) {
    score += paragraphScore;
    scoreLog.push(
      `Paragraph count: ${paragraphCount}, score increased by ${paragraphScore}`
    );
  }
  // Text content length
  const textContentLength = element.textContent?.trim().length || 0;
  if (textContentLength > 200) {
    const textScore = Math.min(Math.floor(textContentLength / 200), 5);
    score += textScore;
    scoreLog.push(
      `Text content length: ${textContentLength}, score increased by ${textScore}`
    );
  }
  // Link density
  const linkDensity = calculateLinkDensity(element);
  if (linkDensity < 0.3) {
    score += 5;
    scoreLog.push(
      `Link density: ${linkDensity.toFixed(2)}, score increased by 5`
    );
  }
  // Data attributes
  if (
    element.hasAttribute("data-main") ||
    element.hasAttribute("data-content")
  ) {
    score += 10;
    scoreLog.push(
      "Data attribute for main content found, score increased by 10"
    );
  }
  // Role attribute
  if (element.getAttribute("role")?.includes("main")) {
    score += 10;
    scoreLog.push(
      "Role attribute indicating main content found, score increased by 10"
    );
  }
  // The individual log lines are collected but only the heading is handed to debugMessage.
  if (scoreLog.length > 0) {
    debugMessage(`Scoring for ${elementToString(element)}:`);
  }
  return score;
}
2217
/**
 * Computes the share of an element's text that sits inside anchor elements.
 * @param element The element to measure.
 * @returns Total anchor text length divided by the element's text length; the
 *          divisor falls back to 1 when the element has no text.
 */
function calculateLinkDensity(element) {
  let linkLength = 0;
  for (const anchor of Array.from(element.getElementsByTagName("a"))) {
    linkLength += anchor.textContent?.length || 0;
  }
  // Fall back to 1 so an element without text never divides by zero.
  const textLength = element.textContent?.length || 1;
  return linkLength / textLength;
}
2225
/**
 * Reports whether an element is visible according to its computed style.
 * @param element The node to test.
 * @returns true for anything that is not an HTMLElement; otherwise false when
 *          the computed display is "none", visibility is "hidden", or opacity is "0".
 */
function isElementVisible(element) {
  // Non-HTMLElements are considered visible.
  if (!(element instanceof HTMLElement)) {
    return true;
  }
  const { display, visibility, opacity } = window.getComputedStyle(element);
  return !(display === "none" || visibility === "hidden" || opacity === "0");
}
2236
/**
 * Gathers the text of an element while leaving out invisible subtrees.
 * @param element The element to read.
 * @returns An empty string when the element is not visible; otherwise the
 *          trimmed concatenation of its text nodes and of the visible text of
 *          its child elements, in document order. Other node types are ignored.
 */
function getVisibleText(element) {
  if (!isElementVisible(element)) {
    return "";
  }
  const { TEXT_NODE, ELEMENT_NODE } = ElementNode_1._Node;
  const combined = Array.from(element.childNodes).reduce((soFar, child) => {
    if (child.nodeType === TEXT_NODE) {
      return soFar + child.textContent;
    }
    if (child.nodeType === ELEMENT_NODE) {
      return soFar + getVisibleText(child);
    }
    return soFar;
  }, "");
  return combined.trim();
}
2250
+
2251
+ var urlUtils = {};
2252
+
2253
+ Object.defineProperty(urlUtils, "__esModule", { value: true });
2254
+ urlUtils.refifyUrls = refifyUrls;
2255
+ const mediaSuffixes = [
2256
+ "jpeg",
2257
+ "jpg",
2258
+ "png",
2259
+ "gif",
2260
+ "bmp",
2261
+ "tiff",
2262
+ "tif",
2263
+ "svg",
2264
+ "webp",
2265
+ "ico",
2266
+ "avi",
2267
+ "mov",
2268
+ "mp4",
2269
+ "mkv",
2270
+ "flv",
2271
+ "wmv",
2272
+ "webm",
2273
+ "mpeg",
2274
+ "mpg",
2275
+ "mp3",
2276
+ "wav",
2277
+ "aac",
2278
+ "ogg",
2279
+ "flac",
2280
+ "m4a",
2281
+ "pdf",
2282
+ "doc",
2283
+ "docx",
2284
+ "ppt",
2285
+ "pptx",
2286
+ "xls",
2287
+ "xlsx",
2288
+ "txt",
2289
+ "css",
2290
+ "js",
2291
+ "xml",
2292
+ "json",
2293
+ "html",
2294
+ "htm",
2295
+ ];
2296
/**
 * Returns the short reference registered for a URL prefix, registering a new
 * one ("ref" followed by the current number of entries) on first use.
 * @param prefix The URL or URL prefix to look up.
 * @param prefixesToRefs The prefix-to-reference map (mutated on first use).
 * @returns The reference string for the prefix.
 */
const addRefPrefix = (prefix, prefixesToRefs) => {
  const known = prefixesToRefs[prefix];
  if (known) {
    return known;
  }
  const created = "ref" + Object.values(prefixesToRefs).length;
  prefixesToRefs[prefix] = created;
  return created;
};
2302
/**
 * Shortens an absolute URL by replacing it, or its directory prefix, with a
 * short reference recorded in prefixesToRefs.
 *
 *  - Values that are not strings, and strings that do not start with "http",
 *    are returned unchanged. The non-string guard keeps nodes without a URL
 *    (for example an image with no src) from throwing.
 *  - When the text after the last "." is a known media suffix, everything up
 *    to the last "/" is replaced by a reference: "refN://file.ext".
 *  - Otherwise a URL with more than four "/"-separated parts is replaced as a
 *    whole by a reference; shorter URLs are returned unchanged.
 *
 * @param url The URL to process.
 * @param prefixesToRefs The prefix-to-reference map (mutated when a reference is created).
 * @returns The shortened URL, the reference, or the input unchanged.
 */
const processUrl = (url, prefixesToRefs) => {
  if (typeof url !== "string" || !url.startsWith("http")) {
    return url;
  }
  const mediaSuffix = url.split(".").slice(-1)[0];
  if (mediaSuffix && mediaSuffixes.includes(mediaSuffix)) {
    const parts = url.split("/");
    // Everything before the final path segment becomes the shared prefix.
    const prefix = parts.slice(0, -1).join("/");
    const refPrefix = addRefPrefix(prefix, prefixesToRefs);
    return `${refPrefix}://${parts.slice(-1).join("")}`;
  }
  if (url.split("/").length > 4) {
    return addRefPrefix(url, prefixesToRefs);
  }
  return url;
};
2321
/**
 * Rewrites the URLs held by link, image and video nodes of a Markdown AST in
 * place, using processUrl, and returns the prefix-to-reference map it built.
 *
 * The walk descends into link content, list items, non-string table cells,
 * blockquotes, semantic HTML containers and, additionally, bold, italic and
 * strikethrough nodes. Those three carry a nested AST as content, so links
 * wrapped in them were previously left with their raw URLs.
 *
 * @param markdownElement A single AST node or an array of nodes (mutated).
 * @param prefixesToRefs The map to fill; a fresh object is used when omitted.
 * @returns The prefixesToRefs map.
 */
function refifyUrls(markdownElement, prefixesToRefs = {}) {
  if (Array.isArray(markdownElement)) {
    for (const element of markdownElement) {
      refifyUrls(element, prefixesToRefs);
    }
    return prefixesToRefs;
  }
  switch (markdownElement.type) {
    case "link":
      markdownElement.href = processUrl(markdownElement.href, prefixesToRefs);
      refifyUrls(markdownElement.content, prefixesToRefs);
      break;
    case "image":
    case "video":
      markdownElement.src = processUrl(markdownElement.src, prefixesToRefs);
      break;
    case "list":
      for (const item of markdownElement.items) {
        refifyUrls(item.content, prefixesToRefs);
      }
      break;
    case "table":
      for (const row of markdownElement.rows) {
        for (const cell of row.cells) {
          // String cells hold plain text and carry no nodes to visit.
          if (typeof cell.content !== "string") {
            refifyUrls(cell.content, prefixesToRefs);
          }
        }
      }
      break;
    case "bold":
    case "italic":
    case "strikethrough":
      // Only descend when the content is a nested AST.
      if (Array.isArray(markdownElement.content)) {
        refifyUrls(markdownElement.content, prefixesToRefs);
      }
      break;
    case "blockquote":
    case "semanticHtml":
      refifyUrls(markdownElement.content, prefixesToRefs);
      break;
  }
  return prefixesToRefs;
}
2359
+
2360
+ var astUtils = {};
2361
+
2362
+ (function (exports) {
2363
+ Object.defineProperty(exports, "__esModule", { value: true });
2364
+ exports.isNot = exports.getMainContent = void 0;
2365
+ exports.findInAST = findInAST;
2366
+ exports.findAllInAST = findAllInAST;
2367
+ const getMainContent = (markdownStr) => {
2368
+ if (markdownStr.includes("<-main->")) {
2369
+ const regex = /(?<=<-main->)[\\s\\S]*?(?=<\\/-main->)/;
2370
+ const match = markdownStr.match(regex);
2371
+ return match?.[0] ?? "";
2372
+ } else {
2373
+ const removeSectionsRegex =
2374
+ /(<-nav->[\\s\\S]*?<\\/-nav->)|(<-footer->[\\s\\S]*?<\\/-footer->)|(<-header->[\\s\\S]*?<\\/-header->)|(<-aside->[\\s\\S]*?<\\/-aside->)/g;
2375
+ return markdownStr.replace(removeSectionsRegex, "");
2376
+ }
2377
+ };
2378
+ exports.getMainContent = getMainContent;
2379
+ const isNot = (tPred) => (t) => !tPred(t);
2380
+ exports.isNot = isNot;
2381
+ const isString = (x) => typeof x === "string";
2382
+ function findInAST(markdownElement, checker) {
2383
+ const loopCheck = (z) => {
2384
+ for (const element of z) {
2385
+ const found = findInAST(element, checker);
2386
+ if (found) {
2387
+ return found;
2388
+ }
2389
+ }
2390
+ return undefined;
2391
+ };
2392
+ if (Array.isArray(markdownElement)) {
2393
+ return loopCheck(markdownElement);
2394
+ } else {
2395
+ if (checker(markdownElement)) {
2396
+ return markdownElement;
2397
+ }
2398
+ switch (markdownElement.type) {
2399
+ case "link":
2400
+ return loopCheck(markdownElement.content);
2401
+ case "list":
2402
+ return loopCheck(
2403
+ markdownElement.items.map((_) => _.content).flat()
2404
+ );
2405
+ case "table":
2406
+ return loopCheck(
2407
+ markdownElement.rows
2408
+ .map((row) =>
2409
+ row.cells
2410
+ .map((_) => _.content)
2411
+ .filter((0, exports.isNot)(isString))
2412
+ )
2413
+ .flat()
2414
+ );
2415
+ case "blockquote":
2416
+ case "semanticHtml":
2417
+ return loopCheck(markdownElement.content);
2418
+ }
2419
+ return undefined;
2420
+ }
2421
+ }
2422
+ function findAllInAST(markdownElement, checker) {
2423
+ const loopCheck = (z) => {
2424
+ let out = [];
2425
+ for (const element of z) {
2426
+ const found = findAllInAST(element, checker);
2427
+ out = [...out, ...found];
2428
+ }
2429
+ return out;
2430
+ };
2431
+ if (Array.isArray(markdownElement)) {
2432
+ return loopCheck(markdownElement);
2433
+ } else {
2434
+ if (checker(markdownElement)) {
2435
+ return [markdownElement];
2436
+ }
2437
+ switch (markdownElement.type) {
2438
+ case "link":
2439
+ return loopCheck(markdownElement.content);
2440
+ case "list":
2441
+ return loopCheck(
2442
+ markdownElement.items.map((_) => _.content).flat()
2443
+ );
2444
+ case "table":
2445
+ return loopCheck(
2446
+ markdownElement.rows
2447
+ .map((row) =>
2448
+ row.cells
2449
+ .map((_) => _.content)
2450
+ .filter((0, exports.isNot)(isString))
2451
+ )
2452
+ .flat()
2453
+ );
2454
+ case "blockquote":
2455
+ case "semanticHtml":
2456
+ return loopCheck(markdownElement.content);
2457
+ }
2458
+ return [];
2459
+ }
2460
+ }
2461
+ })(astUtils);
2462
+
2463
+ var hasRequiredNode;
2464
+
2465
+ function requireNode() {
2466
+ if (hasRequiredNode) return node;
2467
+ hasRequiredNode = 1;
2468
+ (function (exports) {
2469
+ Object.defineProperty(exports, "__esModule", { value: true });
2470
+ exports.wrapMainContent =
2471
+ exports.refifyUrls =
2472
+ exports.findMainContent =
2473
+ exports.markdownASTToString =
2474
+ exports.htmlToMarkdownAST =
2475
+ void 0;
2476
+ exports.convertHtmlToMarkdown = convertHtmlToMarkdown;
2477
+ exports.convertElementToMarkdown = convertElementToMarkdown;
2478
+ exports.findInMarkdownAST = findInMarkdownAST;
2479
+ exports.findAllInMarkdownAST = findAllInMarkdownAST;
2480
+ const htmlToMarkdownAST_1 = htmlToMarkdownAST$1;
2481
+ Object.defineProperty(exports, "htmlToMarkdownAST", {
2482
+ enumerable: true,
2483
+ get: function () {
2484
+ return htmlToMarkdownAST_1.htmlToMarkdownAST;
2485
+ },
2486
+ });
2487
+ const markdownASTToString_1 = requireMarkdownASTToString();
2488
+ Object.defineProperty(exports, "markdownASTToString", {
2489
+ enumerable: true,
2490
+ get: function () {
2491
+ return markdownASTToString_1.markdownASTToString;
2492
+ },
2493
+ });
2494
+ const domUtils_1 = domUtils;
2495
+ Object.defineProperty(exports, "findMainContent", {
2496
+ enumerable: true,
2497
+ get: function () {
2498
+ return domUtils_1.findMainContent;
2499
+ },
2500
+ });
2501
+ Object.defineProperty(exports, "wrapMainContent", {
2502
+ enumerable: true,
2503
+ get: function () {
2504
+ return domUtils_1.wrapMainContent;
2505
+ },
2506
+ });
2507
+ const urlUtils_1 = urlUtils;
2508
+ Object.defineProperty(exports, "refifyUrls", {
2509
+ enumerable: true,
2510
+ get: function () {
2511
+ return urlUtils_1.refifyUrls;
2512
+ },
2513
+ });
2514
+ const astUtils_1 = astUtils;
2515
+ /**
2516
+ * Converts an HTML string to Markdown.
2517
+ * @param html The HTML string to convert.
2518
+ * @param options Conversion options.
2519
+ * @returns The converted Markdown string.
2520
+ */
2521
+ function convertHtmlToMarkdown(html, options) {
2522
+ const parser =
2523
+ options?.overrideDOMParser ??
2524
+ (typeof DOMParser !== "undefined" ? new DOMParser() : null);
2525
+ if (!parser) {
2526
+ throw new Error(
2527
+ "DOMParser is not available. Please provide an overrideDOMParser in options."
2528
+ );
2529
+ }
2530
+ const doc = parser.parseFromString(html, "text/html");
2531
+ let element;
2532
+ if (options?.extractMainContent) {
2533
+ element = (0, domUtils_1.findMainContent)(doc);
2534
+ if (
2535
+ options.includeMetaData &&
2536
+ !!doc.querySelector("head")?.innerHTML &&
2537
+ !element.querySelector("head")
2538
+ ) {
2539
+ // content container was found and extracted, re-attaching the head for meta-data extraction
2540
+ element = parser.parseFromString(
2541
+ \`<html>\${doc.head.outerHTML}\${element.outerHTML}\`,
2542
+ "text/html"
2543
+ ).documentElement;
2544
+ }
2545
+ } else {
2546
+ // If there's a body, use it; otherwise, use the document element
2547
+ if (
2548
+ options?.includeMetaData &&
2549
+ !!doc.querySelector("head")?.innerHTML
2550
+ ) {
2551
+ element = doc.documentElement;
2552
+ } else {
2553
+ element = doc.body || doc.documentElement;
2554
+ }
2555
+ }
2556
+ return convertElementToMarkdown(element, options);
2557
+ }
2558
+ /**
2559
+ * Converts an HTML Element to Markdown.
2560
+ * @param element The HTML Element to convert.
2561
+ * @param options Conversion options.
2562
+ * @returns The converted Markdown string.
2563
+ */
2564
+ function convertElementToMarkdown(element, options) {
2565
+ let ast = (0, htmlToMarkdownAST_1.htmlToMarkdownAST)(element, options);
2566
+ if (options?.refifyUrls) {
2567
+ options.urlMap = (0, urlUtils_1.refifyUrls)(ast);
2568
+ }
2569
+ return (0, markdownASTToString_1.markdownASTToString)(ast, options);
2570
+ }
2571
+ /**
2572
+ * Finds a node in the Markdown AST that matches the given predicate.
2573
+ * @param ast The Markdown AST to search.
2574
+ * @param predicate A function that returns true for the desired node.
2575
+ * @returns The first matching node, or undefined if not found.
2576
+ */
2577
+ function findInMarkdownAST(ast, predicate) {
2578
+ return (0, astUtils_1.findInAST)(ast, predicate);
2579
+ }
2580
+ /**
2581
+ * Finds all nodes in the Markdown AST that match the given predicate.
2582
+ * @param ast The Markdown AST to search.
2583
+ * @param predicate A function that returns true for the desired nodes.
2584
+ * @returns An array of all matching nodes.
2585
+ */
2586
+ function findAllInMarkdownAST(ast, predicate) {
2587
+ return (0, astUtils_1.findAllInAST)(ast, predicate);
2588
+ }
2589
+ })(node);
2590
+ return node;
2591
+ }
2592
+
2593
+ var nodeExports = requireNode();
2594
+
2595
+ //@ts-ignore
2596
+ window.__INTUNED__ = {
2597
+ matchStringsWithDomContent,
2598
+ convertElementToMarkdown,
2599
+ convertHtmlStringToSemanticMarkdown: nodeExports.convertHtmlToMarkdown,
2600
+ getElementXPath: getElementXPath,
2601
+ };
2602
+ })();`;