@tricoteuses/tisseuse 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +22 -0
- package/README.md +5 -0
- package/dist/index.js +707 -0
- package/dist/lib/asserts.d.ts +1 -0
- package/dist/lib/index.d.ts +3 -0
- package/dist/lib/numbers.d.ts +6 -0
- package/dist/lib/server/auditors/config.d.ts +4 -0
- package/dist/lib/server/config.d.ts +18 -0
- package/dist/lib/server/databases/index.d.ts +10 -0
- package/dist/lib/server/text_links.d.ts +46 -0
- package/dist/lib/server/text_links.test.d.ts +1 -0
- package/dist/lib/server/text_parsers/transformers.d.ts +3 -0
- package/dist/lib/strings.d.ts +1 -0
- package/dist/lib/text_parsers/actions.d.ts +1 -0
- package/dist/lib/text_parsers/actions.test.d.ts +1 -0
- package/dist/lib/text_parsers/articles.d.ts +53 -0
- package/dist/lib/text_parsers/articles.test.d.ts +1 -0
- package/dist/lib/text_parsers/ast.d.ts +136 -0
- package/dist/lib/text_parsers/citations.d.ts +7 -0
- package/dist/lib/text_parsers/citations.test.d.ts +1 -0
- package/dist/lib/text_parsers/dates.d.ts +6 -0
- package/dist/lib/text_parsers/dates.test.d.ts +1 -0
- package/dist/lib/text_parsers/divisions.d.ts +29 -0
- package/dist/lib/text_parsers/divisions.test.d.ts +1 -0
- package/dist/lib/text_parsers/helpers.d.ts +10 -0
- package/dist/lib/text_parsers/index.d.ts +4 -0
- package/dist/lib/text_parsers/index.test.d.ts +1 -0
- package/dist/lib/text_parsers/numbers.d.ts +17 -0
- package/dist/lib/text_parsers/numbers.test.d.ts +1 -0
- package/dist/lib/text_parsers/parsers.d.ts +51 -0
- package/dist/lib/text_parsers/parsers.test.d.ts +1 -0
- package/dist/lib/text_parsers/portions.d.ts +53 -0
- package/dist/lib/text_parsers/portions.test.d.ts +1 -0
- package/dist/lib/text_parsers/positions.d.ts +4 -0
- package/dist/lib/text_parsers/prepositions.d.ts +6 -0
- package/dist/lib/text_parsers/prepositions.test.d.ts +1 -0
- package/dist/lib/text_parsers/references.d.ts +13 -0
- package/dist/lib/text_parsers/references.test.d.ts +1 -0
- package/dist/lib/text_parsers/relative_locations.d.ts +9 -0
- package/dist/lib/text_parsers/relative_locations.test.d.ts +1 -0
- package/dist/lib/text_parsers/separators.d.ts +7 -0
- package/dist/lib/text_parsers/simplifiers.d.ts +13 -0
- package/dist/lib/text_parsers/simplifiers.test.d.ts +1 -0
- package/dist/lib/text_parsers/texts.d.ts +42 -0
- package/dist/lib/text_parsers/texts.test.d.ts +1 -0
- package/dist/lib/text_parsers/transformers.d.ts +53 -0
- package/dist/lib/text_parsers/typography.d.ts +9 -0
- package/dist/lib/text_parsers/typography.test.d.ts +1 -0
- package/dist/scripts/add_links_to_html_document.d.ts +1 -0
- package/dist/scripts/add_references_to_html_document.d.ts +6 -0
- package/dist/scripts/extract_texts_infos.d.ts +1 -0
- package/dist/scripts/extract_texts_titles_infos.d.ts +1 -0
- package/dist/scripts/html_document_to_text.d.ts +1 -0
- package/package.json +69 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,707 @@
|
|
|
1
|
+
function _(u, n) {
|
|
2
|
+
throw `Unexpected type ${u}: ${n}`;
|
|
3
|
+
}
|
|
4
|
+
// Matches the beginning of a tag: "<", an optional "/", then a name captured
// in group 1 ("!DOCTYPE", "?XML", or a letter followed by letters/digits).
// The match is case-insensitive and anchored at the start of the string.
const N = /^<\/?(!DOCTYPE|\?XML|[A-Z][A-Z0-9]*)/i;
|
|
5
|
+
function H(u, n) {
|
|
6
|
+
return (t) => {
|
|
7
|
+
const s = [];
|
|
8
|
+
let e = t;
|
|
9
|
+
for (const i of n) {
|
|
10
|
+
const r = i(e);
|
|
11
|
+
(r.transformations !== void 0 || r.sourceMap.length !== 0) && (s.push(r), e = r.output);
|
|
12
|
+
}
|
|
13
|
+
return { input: t, output: e, title: u, transformations: s };
|
|
14
|
+
};
|
|
15
|
+
}
|
|
16
|
+
/**
 * Depth-first generator over the leaves of a transformation tree.
 *
 * A transformation without a `transformations` property is a leaf and is
 * yielded as is; otherwise the generator recurses into each child in order.
 *
 * @param u Transformation (leaf or node).
 */
function* $(u) {
  const children = u.transformations;
  if (children !== undefined) {
    for (const child of children) {
      yield* $(child);
    }
    return;
  }
  yield u;
}
|
|
23
|
+
/**
 * Generator mapping positions expressed in the final output of transformation
 * `u` back to its original input.
 *
 * One per-leaf generator (`y`) is created for every leaf of `u`, ordered from
 * the last leaf to the first. Each `{ start, stop }` position received through
 * `next()` is wrapped in a fragment and passed through every per-leaf
 * generator in turn; the resulting fragment is yielded. The first yielded
 * value is a placeholder fragment at `{ start: 0, stop: 0 }`.
 *
 * The generator returns as soon as a per-leaf generator is done. When it
 * receives `undefined`, it forwards `undefined` to every per-leaf generator
 * and ends.
 *
 * @param u Transformation (leaf or node).
 */
function* U(u) {
  const leaves = Array.from($(u)).reverse();
  let fragment = {
    position: { start: 0, stop: 0 },
  };
  const mappers = [];
  for (const leaf of leaves) {
    const mapper = y(leaf);
    // Run each per-leaf generator up to its first yield.
    mapper.next(fragment);
    mappers.push(mapper);
  }
  let requested = yield fragment;
  while (requested !== undefined) {
    fragment = { position: requested };
    for (const mapper of mappers) {
      const step = mapper.next(fragment);
      if (step.done) {
        return;
      }
      fragment = step.value;
    }
    requested = yield fragment;
  }
  for (const mapper of mappers) {
    mapper.next(undefined);
  }
}
|
|
54
|
+
// Generator that maps fragments from the output coordinates of a single
// transformation leaf `u` back to its input coordinates, using `u.sourceMap`.
//
// Protocol: every `next(fragment)` call supplies an object holding a
// `position` ({ start, stop }, output offsets) and optionally `innerPrefix`,
// `innerSuffix`, `outerPrefix` and `outerSuffix` strings. The generator yields
// an object of the same shape whose `position` is expressed in input offsets.
// The first yielded value is a placeholder at { start: 0, stop: 0 }.
// Receiving `undefined` ends the generator.
//
// NOTE(review): segments are only walked forward, so positions presumably
// have to be supplied in increasing order — confirm with callers.
function* y(u) {
|
|
55
|
+
// n: index of the segment where the current range starts; t: state of the
// walk; s: fragment most recently received through next().
let n, t = "looking_for_start_segment", s = yield {
|
|
56
|
+
position: { start: 0, stop: 0 }
|
|
57
|
+
};
|
|
58
|
+
// Sentinel segments are added before and after the leaf's own segments.
const e = [
|
|
59
|
+
{ inputIndex: 0, inputLength: 0, outputIndex: 0, outputLength: 0 },
|
|
60
|
+
...u.sourceMap,
|
|
61
|
+
{
|
|
62
|
+
inputIndex: Number.MAX_SAFE_INTEGER,
|
|
63
|
+
inputLength: 0,
|
|
64
|
+
outputIndex: Number.MAX_SAFE_INTEGER,
|
|
65
|
+
outputLength: 0
|
|
66
|
+
}
|
|
67
|
+
];
|
|
68
|
+
for (const [i, r] of e.entries())
|
|
69
|
+
// c becomes true when the walk has to move on to the next segment.
for (let c = !1; !c; ) {
|
|
70
|
+
if (s === void 0)
|
|
71
|
+
return;
|
|
72
|
+
// p, l, d, g collect additions to innerPrefix, innerSuffix, outerPrefix
// and outerSuffix respectively (see the yield below).
let { position: o } = s, p, l, d, g;
|
|
73
|
+
switch (t) {
|
|
74
|
+
case "looking_for_start_segment": {
|
|
75
|
+
// Advance while the NEXT segment still ends at or before the range start.
const x = e[i + 1];
|
|
76
|
+
x !== void 0 && x.outputIndex + x.outputLength <= o.start ? c = !0 : (n = i, t = "looking_for_stop_segment");
|
|
77
|
+
break;
|
|
78
|
+
}
|
|
79
|
+
case "looking_for_stop_segment": {
|
|
80
|
+
// Advance while the current segment starts before the range stop.
if (r.outputIndex < o.stop)
|
|
81
|
+
c = !0;
|
|
82
|
+
else {
|
|
83
|
+
const x = e[n], T = i, f = r;
|
|
84
|
+
// m / a: first and last indexes (in the padded array e) of the segments
// considered to be inside the range.
let m = o.start < x.outputIndex + x.outputLength ? n : n + 1, a = o.stop > f.outputIndex ? T : T - 1;
|
|
85
|
+
// The scan is restarted (M) each time the range is widened below.
for (let M = !0; M; ) {
|
|
86
|
+
M = !1, p = void 0, l = void 0, d = void 0, g = void 0;
|
|
87
|
+
for (let P = m; P <= a; P++) {
|
|
88
|
+
// matchingSegmentIndex is used with + 1 below to index the padded array e
// (which has one sentinel in front of u.sourceMap).
const E = e[P].matchingSegmentIndex;
|
|
89
|
+
if (E !== void 0) {
|
|
90
|
+
// The matching segment lies before the first segment of the range.
if (E + 1 < m) {
|
|
91
|
+
let R = !1;
|
|
92
|
+
const L = e[E + 1];
|
|
93
|
+
if (L.openingTag !== void 0 && L.outputIndex + L.outputLength < o.start) {
|
|
94
|
+
const A = L.openingTag.match(N);
|
|
95
|
+
if (A !== null) {
|
|
96
|
+
const v = A[1], O = v.toUpperCase();
|
|
97
|
+
// For these tag names the range is kept as is: a closing tag is appended to
// d (outer prefix) and the opening tag is prepended to p (inner prefix).
[
|
|
98
|
+
"B",
|
|
99
|
+
"EM",
|
|
100
|
+
"I",
|
|
101
|
+
"SPAN",
|
|
102
|
+
"STRONG",
|
|
103
|
+
"SUB",
|
|
104
|
+
"SUP"
|
|
105
|
+
].includes(O) && (d = `${d ?? ""}</${v}>`, p = `${L.openingTag}${p ?? ""}`, R = !0);
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
// Otherwise the range start is moved back to the matching segment and the
// scan restarts.
if (!R) {
|
|
109
|
+
p = void 0, d = void 0, m = E + 1, o = {
|
|
110
|
+
start: e[m].outputIndex,
|
|
111
|
+
stop: o.stop
|
|
112
|
+
}, M = !0;
|
|
113
|
+
break;
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
// The matching segment lies after the last segment of the range.
if (E + 1 > a) {
|
|
117
|
+
let R = !1;
|
|
118
|
+
const L = e[P], A = e[E + 1];
|
|
119
|
+
if (L.openingTag !== void 0 && o.stop < A.outputIndex) {
|
|
120
|
+
const v = L.openingTag.match(N);
|
|
121
|
+
if (v !== null) {
|
|
122
|
+
const O = v[1], F = O.toUpperCase();
|
|
123
|
+
// For these tag names the range is kept as is: a closing tag is appended to
// l (inner suffix) and the opening tag is prepended to g (outer suffix).
[
|
|
124
|
+
"B",
|
|
125
|
+
"EM",
|
|
126
|
+
"I",
|
|
127
|
+
"SPAN",
|
|
128
|
+
"STRONG",
|
|
129
|
+
"SUB",
|
|
130
|
+
"SUP"
|
|
131
|
+
].includes(F) && (l = `${l ?? ""}</${O}>`, g = `${L.openingTag}${g ?? ""}`, R = !0);
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
// Otherwise the range stop is moved forward to the end of the matching
// segment and the scan restarts.
if (!R) {
|
|
135
|
+
l = void 0, g = void 0, a = E + 1;
|
|
136
|
+
const v = e[a];
|
|
137
|
+
o = {
|
|
138
|
+
start: o.start,
|
|
139
|
+
stop: v.outputIndex + v.outputLength
|
|
140
|
+
}, M = !0;
|
|
141
|
+
break;
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
// Convert output offsets to input offsets: S is computed relative to the end
// of the segment preceding the range (h), C relative to the end of the last
// segment of the range (I).
const h = e[m - 1], S = h.inputIndex + h.inputLength + o.start - (h.outputIndex + h.outputLength), I = e[a], C = I.inputIndex + I.inputLength + o.stop - (I.outputIndex + I.outputLength);
|
|
148
|
+
// Yield the mapped fragment (keys whose value is undefined are removed), then
// wait for the next fragment and go back to looking for a start segment.
s = yield Object.fromEntries(
|
|
149
|
+
Object.entries({
|
|
150
|
+
innerPrefix: `${p ?? ""}${s.innerPrefix ?? ""}` || void 0,
|
|
151
|
+
innerSuffix: `${s.innerSuffix ?? ""}${l ?? ""}` || void 0,
|
|
152
|
+
outerPrefix: `${s.outerPrefix ?? ""}${d ?? ""}` || void 0,
|
|
153
|
+
outerSuffix: `${g ?? ""}${s.outerSuffix ?? ""}` || void 0,
|
|
154
|
+
position: {
|
|
155
|
+
start: S,
|
|
156
|
+
stop: C
|
|
157
|
+
}
|
|
158
|
+
}).filter(([, M]) => M !== void 0)
|
|
159
|
+
), t = "looking_for_start_segment";
|
|
160
|
+
}
|
|
161
|
+
break;
|
|
162
|
+
}
|
|
163
|
+
default:
|
|
164
|
+
_(
|
|
165
|
+
"iterOriginalMergedPositionsFromTransformedUsingTransformationLeaf.state",
|
|
166
|
+
t
|
|
167
|
+
);
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
/**
 * Maps every position of `n` (offsets in the final output of transformation
 * `u`) back to the original input, by driving the generator `U`.
 *
 * @param u Transformation (leaf or node).
 * @param n Array of `{ start, stop }` positions in output coordinates.
 * @returns One mapped fragment per position, in the same order.
 * @throws {Error} When the generator finishes before a position was mapped.
 *   The position is serialized as JSON so the message shows its offsets
 *   instead of "[object Object]".
 */
function q(u, n) {
  const mapper = U(u);
  // Run the generator up to its first yield; the value passed here is ignored.
  mapper.next({ start: 0, stop: 0 });
  return n.map((position) => {
    const step = mapper.next(position);
    if (step.done) {
      throw new Error(
        `Reverse transformation of position failed: ${JSON.stringify(position)}`
      );
    }
    return step.value;
  });
}
|
|
182
|
+
/**
 * Maps positions expressed in the final output of transformation `u` back to
 * its original input, applying `b` with the source map of every leaf of `u`,
 * from the last leaf to the first.
 *
 * @param u Transformation (leaf or node).
 * @param n Positions in output coordinates.
 * @returns The positions produced by the last call to `b` (or `n` itself when
 *   `u` has no leaf).
 */
function W(u, n) {
  const leaves = Array.from($(u));
  let positions = n;
  for (let index = leaves.length - 1; index >= 0; index--) {
    positions = b(leaves[index].sourceMap, positions);
  }
  return positions;
}
|
|
192
|
+
// Maps positions expressed in the output coordinates of a single source map
// `u` back to input coordinates.
//
// `n` is an iterable of { start, stop } positions. The function returns a new
// array of { start, stop } positions in input coordinates; one requested
// position can produce several returned positions (see the extra `t.push`
// calls inside the loop).
//
// NOTE(review): the segment cursor `s` only moves forward, so `n` presumably
// has to be sorted by start — confirm with callers.
function b(u, n) {
|
|
193
|
+
const t = [];
|
|
194
|
+
// Sentinel segments are added before and after the given segments.
u = [
|
|
195
|
+
{ inputIndex: 0, inputLength: 0, outputIndex: 0, outputLength: 0 },
|
|
196
|
+
...u,
|
|
197
|
+
{
|
|
198
|
+
inputIndex: Number.MAX_SAFE_INTEGER,
|
|
199
|
+
inputLength: 0,
|
|
200
|
+
outputIndex: Number.MAX_SAFE_INTEGER,
|
|
201
|
+
outputLength: 0
|
|
202
|
+
}
|
|
203
|
+
];
|
|
204
|
+
let s = 0, e = u[s];
|
|
205
|
+
for (const i of n) {
|
|
206
|
+
// r (start) can be moved forward below; c (stop) is fixed.
let { start: r } = i;
|
|
207
|
+
const { stop: c } = i;
|
|
208
|
+
// The labelled loop is restarted with `continue t` after part of the
// position has been emitted and the start has been moved.
t: for (let o = !1; !o; ) {
|
|
209
|
+
// Skip the segments that end at or before the start.
for (; e.outputIndex + e.outputLength <= r; s++, e = u[s]) ;
|
|
210
|
+
let p = s;
|
|
211
|
+
const l = u[p - 1];
|
|
212
|
+
// d: start in input coordinates, computed relative to the end of the
// preceding segment. g: index of the last segment starting before the stop.
let d = l.inputIndex + l.inputLength + r - (l.outputIndex + l.outputLength), g;
|
|
213
|
+
for (g = p - 1; u[g + 1].outputIndex < c; g++) ;
|
|
214
|
+
const x = u[g];
|
|
215
|
+
// T: stop in input coordinates, computed relative to the end of segment g.
let T = x.inputIndex + x.inputLength + c - (x.outputIndex + x.outputLength);
|
|
216
|
+
for (let f = p; f <= g; f++) {
|
|
217
|
+
const m = u[f], a = m.matchingSegmentIndex;
|
|
218
|
+
if (a !== void 0) {
|
|
219
|
+
// The matching segment (index a + 1 in the padded array) lies before the
// first segment of the range.
if (a + 1 < p) {
|
|
220
|
+
const h = u[a + 1];
|
|
221
|
+
if (h.outputIndex < r) {
|
|
222
|
+
// Emit the part located before segment m (when not empty), move the start
// after segment m and restart.
m.inputIndex > d && t.push({
|
|
223
|
+
start: d,
|
|
224
|
+
stop: m.inputIndex
|
|
225
|
+
}), r = m.outputIndex + m.outputLength;
|
|
226
|
+
for (let S = f, I = m; I.outputIndex + I.outputLength === r; S++, I = u[S])
|
|
227
|
+
s = S;
|
|
228
|
+
continue t;
|
|
229
|
+
}
|
|
230
|
+
p = a + 1, d = h.inputIndex;
|
|
231
|
+
// The matching segment lies after the last segment of the range.
} else if (a + 1 > g) {
|
|
232
|
+
const h = u[a + 1];
|
|
233
|
+
if (h.outputIndex + h.outputLength > c) {
|
|
234
|
+
m.inputIndex > d && t.push({
|
|
235
|
+
start: d,
|
|
236
|
+
stop: m.inputIndex
|
|
237
|
+
}), r = m.outputIndex + m.outputLength;
|
|
238
|
+
for (let S = f, I = m; I.outputIndex + I.outputLength === r; S++, I = u[S])
|
|
239
|
+
s = S;
|
|
240
|
+
continue t;
|
|
241
|
+
}
|
|
242
|
+
g = a + 1, T = h.inputIndex + h.inputLength;
|
|
243
|
+
}
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
t.push({
|
|
247
|
+
start: d,
|
|
248
|
+
stop: T
|
|
249
|
+
}), o = !0;
|
|
250
|
+
}
|
|
251
|
+
}
|
|
252
|
+
return t;
|
|
253
|
+
}
|
|
254
|
+
// Named HTML entities (lower-case name, without "&" and ";") and the text
// each one decodes to.
const B = {
  amp: "&",
  apos: "'",
  asymp: "≈",
  copy: "©",
  deg: "°",
  euro: "€",
  gt: ">",
  lt: "<",
  mdash: "—",
  // No-break space, written as an escape because the literal character cannot
  // be told apart from an ordinary space when reading the source.
  nbsp: "\u00A0",
  ndash: "–",
  ne: "≠",
  pound: "£",
  quot: '"',
  reg: "®",
  trade: "™"
};
|
|
272
|
+
// Returns a transformer that converts an HTML string to text and records a
// source map of every removed or replaced tag.
//
// Option `removeAWithHref`: when truthy, <a> elements whose opening tag
// contains " href=" are dropped together with their content.
//
// The transformer returns { input, output, sourceMap, title }. In text that is
// copied to the output, "\n" and "\r" are replaced with a space.
function k({
|
|
273
|
+
removeAWithHref: u
|
|
274
|
+
} = {}) {
|
|
275
|
+
return (n) => {
|
|
276
|
+
// t: read cursor in the input; s: offset added to an input index to obtain
// the matching output index; e: output fragments; i: source map segments.
let t = 0, s = 0, e = [], i = [];
|
|
277
|
+
// r: stack of the currently open elements.
const r = [], c = "Conversion des éléments HTML en texte";
|
|
278
|
+
for (; t < n.length; ) {
|
|
279
|
+
// Without another "<" (or without a ">" after it) the rest of the input is
// copied and the result returned.
const o = n.indexOf("<", t);
|
|
280
|
+
if (o === -1)
|
|
281
|
+
return e.push(n.slice(t).replace(/[\n\r]/g, " ")), {
|
|
282
|
+
input: n,
|
|
283
|
+
output: e.join(""),
|
|
284
|
+
sourceMap: i,
|
|
285
|
+
title: c
|
|
286
|
+
};
|
|
287
|
+
const p = n.indexOf(">", o);
|
|
288
|
+
if (p === -1)
|
|
289
|
+
return e.push(n.slice(t).replace(/[\n\r]/g, " ")), {
|
|
290
|
+
input: n,
|
|
291
|
+
output: e.join(""),
|
|
292
|
+
sourceMap: i,
|
|
293
|
+
title: c
|
|
294
|
+
};
|
|
295
|
+
// l: the tag text from "<" to ">"; d: true for a closing tag; g: tag length.
const l = n.slice(o, p + 1), d = l.startsWith("</"), g = l.length, x = l.match(N);
|
|
296
|
+
// Text between "<" and ">" that does not match the tag pattern N is copied
// unchanged.
if (x === null) {
|
|
297
|
+
e.push(
|
|
298
|
+
n.slice(t, p + 1).replace(/[\n\r]/g, " ")
|
|
299
|
+
), t = p + 1;
|
|
300
|
+
continue;
|
|
301
|
+
}
|
|
302
|
+
const f = x[1].toUpperCase();
|
|
303
|
+
// Tags ending with "/>" and the tag names listed here are handled without
// touching the stack of open elements.
if (l.endsWith("/>") || [
|
|
304
|
+
"!DOCTYPE",
|
|
305
|
+
"?XML",
|
|
306
|
+
"AREA",
|
|
307
|
+
"BASE",
|
|
308
|
+
"BR",
|
|
309
|
+
"COL",
|
|
310
|
+
"EMBED",
|
|
311
|
+
"HR",
|
|
312
|
+
"IMG",
|
|
313
|
+
"INPUT",
|
|
314
|
+
"LINK",
|
|
315
|
+
"META",
|
|
316
|
+
"PARAM",
|
|
317
|
+
"SOURCE",
|
|
318
|
+
"TRACK",
|
|
319
|
+
"WBR"
|
|
320
|
+
].includes(f))
|
|
321
|
+
// A closing form of such a tag is removed from the output.
if (d)
|
|
322
|
+
o > t && e.push(
|
|
323
|
+
n.slice(t, o).replace(/[\n\r]/g, " ")
|
|
324
|
+
), i.push({
|
|
325
|
+
inputIndex: o,
|
|
326
|
+
inputLength: g,
|
|
327
|
+
outputIndex: o + s,
|
|
328
|
+
outputLength: 0
|
|
329
|
+
}), s -= g;
|
|
330
|
+
else if (["BR", "HR"].includes(f)) {
|
|
331
|
+
o > t && e.push(
|
|
332
|
+
n.slice(t, o).replace(/[\n\r]/g, " ")
|
|
333
|
+
);
|
|
334
|
+
// BR and HR are replaced with a one-character line break.
const a = p + 1 - o;
|
|
335
|
+
e.push(`
|
|
336
|
+
`), i.push({
|
|
337
|
+
inputIndex: o,
|
|
338
|
+
inputLength: a,
|
|
339
|
+
openingTag: l,
|
|
340
|
+
outputIndex: o + s,
|
|
341
|
+
outputLength: 1
|
|
342
|
+
}), s += 1 - a;
|
|
343
|
+
} else ["!DOCTYPE", "?XML", "COL", "IMG", "INPUT"].includes(f) ? (o > t && e.push(
|
|
344
|
+
n.slice(t, o).replace(/[\n\r]/g, " ")
|
|
345
|
+
), i.push({
|
|
346
|
+
inputIndex: o,
|
|
347
|
+
inputLength: g,
|
|
348
|
+
openingTag: l,
|
|
349
|
+
outputIndex: o + s,
|
|
350
|
+
outputLength: 0
|
|
351
|
+
}), s -= g) : e.push(
|
|
352
|
+
n.slice(t, p + 1).replace(/[\n\r]/g, " ")
|
|
353
|
+
);
|
|
354
|
+
else if (d) {
|
|
355
|
+
// Closing tag: it is matched against the element on top of the stack only.
const a = r.at(-1);
|
|
356
|
+
if (f === a?.name)
|
|
357
|
+
switch (r.pop(), a.action) {
|
|
358
|
+
// No action: the text and the closing tag are copied unchanged.
case void 0: {
|
|
359
|
+
e.push(
|
|
360
|
+
n.slice(t, p + 1).replace(/[\n\r]/g, " ")
|
|
361
|
+
);
|
|
362
|
+
break;
|
|
363
|
+
}
|
|
364
|
+
case "ignore": {
|
|
365
|
+
// Restore the state saved at the opening tag, then record one segment that
// spans from the opening tag to the end of the closing tag.
e = a.outputFragments, s = a.outputOffset, i = a.sourceMap;
|
|
366
|
+
const h = p + 1 - a.inputIndex;
|
|
367
|
+
i.push({
|
|
368
|
+
inputIndex: a.inputIndex,
|
|
369
|
+
inputLength: h,
|
|
370
|
+
outputIndex: a.inputIndex + s,
|
|
371
|
+
outputLength: 0
|
|
372
|
+
}), s = s - h;
|
|
373
|
+
break;
|
|
374
|
+
}
|
|
375
|
+
case "keep_content": {
|
|
376
|
+
e.push(
|
|
377
|
+
n.slice(t, o).replace(/[\n\r]/g, " ")
|
|
378
|
+
), a.closingTagReplacement.length !== 0 && e.push(a.closingTagReplacement);
|
|
379
|
+
const h = p + 1 - o;
|
|
380
|
+
// Link the opening and closing segments to each other through
// matchingSegmentIndex.
i[a.openingSegmentIndex].matchingSegmentIndex = i.length, i.push({
|
|
381
|
+
inputIndex: o,
|
|
382
|
+
inputLength: h,
|
|
383
|
+
matchingSegmentIndex: a.openingSegmentIndex,
|
|
384
|
+
outputIndex: o + s,
|
|
385
|
+
outputLength: a.closingTagReplacement.length
|
|
386
|
+
}), s += a.closingTagReplacement.length - h;
|
|
387
|
+
break;
|
|
388
|
+
}
|
|
389
|
+
default:
|
|
390
|
+
_("TagInfos.action", a);
|
|
391
|
+
}
|
|
392
|
+
else
|
|
393
|
+
// A closing tag that does not match the top of the stack is removed from the
// output.
o > t && e.push(
|
|
394
|
+
n.slice(t, o).replace(/[\n\r]/g, " ")
|
|
395
|
+
), i.push({
|
|
396
|
+
inputIndex: o,
|
|
397
|
+
inputLength: g,
|
|
398
|
+
outputIndex: o + s,
|
|
399
|
+
outputLength: 0
|
|
400
|
+
}), s -= g;
|
|
401
|
+
} else if (["COLGROUP", "HEAD", "SCRIPT", "STYLE"].includes(f) || u && f === "A" && / href=/i.test(l))
|
|
402
|
+
// Opening tag of an element that is dropped together with its content.
o > t && e.push(
|
|
403
|
+
n.slice(t, o).replace(/[\n\r]/g, " ")
|
|
404
|
+
), r.push({
|
|
405
|
+
action: "ignore",
|
|
406
|
+
inputIndex: o,
|
|
407
|
+
name: f,
|
|
408
|
+
// Backup outputFragments, outputOffset & sourceMap, because
|
|
409
|
+
// every changes made inside ignored element will be ignored.
|
|
410
|
+
outputFragments: e,
|
|
411
|
+
outputOffset: s,
|
|
412
|
+
sourceMap: i
|
|
413
|
+
}), e = [], i = [];
|
|
414
|
+
else if ([
|
|
415
|
+
"A",
|
|
416
|
+
// When removeAWithHref is false or no href
|
|
417
|
+
"B",
|
|
418
|
+
"BODY",
|
|
419
|
+
"DL",
|
|
420
|
+
"EM",
|
|
421
|
+
"HTML",
|
|
422
|
+
"I",
|
|
423
|
+
"OL",
|
|
424
|
+
"SPAN",
|
|
425
|
+
"STRONG",
|
|
426
|
+
"SUB",
|
|
427
|
+
"SUP",
|
|
428
|
+
"TABLE",
|
|
429
|
+
"TBODY",
|
|
430
|
+
"THEAD",
|
|
431
|
+
"TR",
|
|
432
|
+
"UL"
|
|
433
|
+
].includes(f)) {
|
|
434
|
+
// Opening tag of an element whose tags are removed and whose content is
// kept.
o > t && e.push(
|
|
435
|
+
n.slice(t, o).replace(/[\n\r]/g, " ")
|
|
436
|
+
);
|
|
437
|
+
const a = i.length;
|
|
438
|
+
i.push({
|
|
439
|
+
inputIndex: o,
|
|
440
|
+
inputLength: g,
|
|
441
|
+
openingTag: l,
|
|
442
|
+
outputIndex: o + s,
|
|
443
|
+
outputLength: 0
|
|
444
|
+
}), s -= g, r.push({
|
|
445
|
+
action: "keep_content",
|
|
446
|
+
closingTagReplacement: "",
|
|
447
|
+
name: f,
|
|
448
|
+
openingSegmentIndex: a
|
|
449
|
+
});
|
|
450
|
+
} else if ([
|
|
451
|
+
"CAPTION",
|
|
452
|
+
"DD",
|
|
453
|
+
"DT",
|
|
454
|
+
"H1",
|
|
455
|
+
"H2",
|
|
456
|
+
"H3",
|
|
457
|
+
"H4",
|
|
458
|
+
"H5",
|
|
459
|
+
"H6",
|
|
460
|
+
"DIV",
|
|
461
|
+
"FORM",
|
|
462
|
+
"LI",
|
|
463
|
+
"P",
|
|
464
|
+
"TD",
|
|
465
|
+
"TH"
|
|
466
|
+
].includes(f)) {
|
|
467
|
+
// Opening tag of an element whose opening and closing tags are each replaced
// with a line break.
o > t && e.push(
|
|
468
|
+
n.slice(t, o).replace(/[\n\r]/g, " ")
|
|
469
|
+
), e.push(`
|
|
470
|
+
`);
|
|
471
|
+
const a = i.length;
|
|
472
|
+
i.push({
|
|
473
|
+
inputIndex: o,
|
|
474
|
+
inputLength: g,
|
|
475
|
+
openingTag: l,
|
|
476
|
+
outputIndex: o + s,
|
|
477
|
+
outputLength: 1
|
|
478
|
+
}), s += 1 - g, r.push({
|
|
479
|
+
action: "keep_content",
|
|
480
|
+
closingTagReplacement: `
|
|
481
|
+
`,
|
|
482
|
+
name: f,
|
|
483
|
+
openingSegmentIndex: a
|
|
484
|
+
});
|
|
485
|
+
} else
|
|
486
|
+
// Any other opening tag is copied unchanged and pushed on the stack without
// an action.
e.push(
|
|
487
|
+
n.slice(t, p + 1).replace(/[\n\r]/g, " ")
|
|
488
|
+
), r.push({
|
|
489
|
+
name: f
|
|
490
|
+
});
|
|
491
|
+
t = p + 1;
|
|
492
|
+
}
|
|
493
|
+
return {
|
|
494
|
+
input: n,
|
|
495
|
+
output: e.join(""),
|
|
496
|
+
sourceMap: i,
|
|
497
|
+
title: c
|
|
498
|
+
};
|
|
499
|
+
};
|
|
500
|
+
}
|
|
501
|
+
/**
 * Decodes the named HTML entities listed in `B` (matched case-insensitively)
 * and records one source map segment per decoded entity.
 *
 * @param u Text to decode.
 * @returns `{ input, output, sourceMap, title }`.
 */
function D(u) {
  const sourceMap = [];
  // Difference between output and input offsets accumulated so far.
  let drift = 0;
  const entityPattern = /&(amp|apos|asymp|copy|deg|euro|gt|lt|mdash|nbsp|ndash|ne|pound|quot|reg|trade);/gi;
  const output = u.replace(entityPattern, (entity, name, index) => {
    const decoded = B[name.toLowerCase()];
    sourceMap.push({
      inputIndex: index,
      inputLength: entity.length,
      outputIndex: index + drift,
      outputLength: decoded.length,
    });
    drift += decoded.length - entity.length;
    return decoded;
  });
  return {
    input: u,
    output,
    sourceMap,
    title: "Décodage des entités HTML nommées",
  };
}
|
|
522
|
+
/**
 * Decodes numeric HTML character references (decimal `&#NNN;` and hexadecimal
 * `&#xHHH;`, matched case-insensitively) and records one source map segment
 * per decoded reference.
 *
 * Code points above U+FFFF are decoded with `String.fromCodePoint`, so they
 * produce the proper surrogate pair (output length 2) instead of a truncated
 * 16-bit character. References above U+10FFFF are not valid code points: they
 * are left untouched and produce no segment.
 *
 * @param u Text to decode.
 * @returns `{ input, output, sourceMap, title }`.
 */
function G(u) {
  const sourceMap = [];
  // Difference between output and input offsets accumulated so far.
  let drift = 0;
  const output = u.replace(
    /&#(?:(\d+)|x([0-9A-F]+));/gi,
    (reference, decimal, hexadecimal, index) => {
      const codePoint = decimal === undefined
        ? parseInt(hexadecimal, 16)
        : parseInt(decimal, 10);
      if (codePoint > 0x10ffff) {
        return reference;
      }
      const decoded = String.fromCodePoint(codePoint);
      sourceMap.push({
        inputIndex: index,
        inputLength: reference.length,
        outputIndex: index + drift,
        outputLength: decoded.length,
      });
      drift += decoded.length - reference.length;
      return decoded;
    }
  );
  return {
    input: u,
    output,
    sourceMap,
    title: "Décodage des entités HTML numériques",
  };
}
|
|
546
|
+
/**
 * Returns a transformer replacing every match of `u` with `n`, and recording
 * one source map segment per replacement.
 *
 * In `n`, `$1`, `$2`, … stand for the capture groups of the match:
 * - a group that did not take part in the match is replaced with an empty
 *   string (not with the text "undefined");
 * - the captured text is inserted literally, so a `$` it contains is never
 *   re-interpreted as a replacement pattern;
 * - patterns with named groups are supported (the trailing `groups` argument
 *   handed to the callback is skipped when locating the match offset).
 *
 * @param u Pattern given to `String.prototype.replaceAll` (a string, or a
 *   regular expression with the global flag).
 * @param n Replacement template.
 * @returns A function returning `{ input, output, sourceMap, title }`.
 */
function w(u, n) {
  return (t) => {
    const sourceMap = [];
    // Difference between output and input offsets accumulated so far.
    let drift = 0;
    const output = t.replaceAll(u, (match, ...rest) => {
      // Callback arguments after the match: capture groups, offset, whole
      // string, then a `groups` object only when the pattern has named groups.
      const tailLength = typeof rest[rest.length - 1] === "object" ? 3 : 2;
      const offset = rest[rest.length - tailLength];
      const groups = rest.slice(0, rest.length - tailLength);
      let replacement = n;
      for (const [index, group] of groups.entries()) {
        // A function replacer inserts the group text without interpreting "$".
        replacement = replacement.replaceAll(`$${index + 1}`, () => group ?? "");
      }
      sourceMap.push({
        inputIndex: offset,
        inputLength: match.length,
        outputIndex: offset + drift,
        outputLength: replacement.length,
      });
      drift += replacement.length - match.length;
      return replacement;
    });
    return {
      input: t,
      output,
      sourceMap,
      title: `Remplacement de ${u} par ${JSON.stringify(n)}`,
    };
  };
}
|
|
571
|
+
/**
 * Applies a fixed list of pattern replacements (built with `w`) one after the
 * other: HTML comments, <script> elements and <style> elements are removed, a
 * space is inserted after "n°" when missing, and a set of private-use
 * characters is removed. Only the steps that replaced something are recorded.
 *
 * @param u Text to clean.
 * @returns `{ input, output, title, transformations }`.
 */
function X(u) {
  // Note: The most englobing patterns must be first.
  const rules = [
    // Remove HTML comment.
    [/<!--.*?-->/gs, ""],
    // Remove <script> element.
    [/<script.*?>.*?<\/script>/gis, ""],
    // Remove <style> element.
    [/<style.*?>.*?<\/style>/gis, ""],
    // Ensure that there is always a space after "n°".
    [/(\sn°)([^\s])/gi, "$1 $2"],
    // Remove Sénat "pastillage":
    // - \uF04B-\uF054 are circled numbers 0-9.
    // - \uF031-\uF039 are left-half circled numbers 1-9.
    // - \uF041-\uF04A are numbers 0-9 with a circle fragment on their top & bottom only.
    // - \uF061-\uF06A are right-half circled numbers 0-9.
    [/[\uF031-\uF039\uF041-\uF054\uF061-\uF06A]/g, ""],
  ];
  const applied = [];
  let current = u;
  for (const [pattern, replacement] of rules) {
    const step = w(pattern, replacement)(current);
    if (step.sourceMap.length === 0) {
      continue;
    }
    applied.push(step);
    current = step.output;
  }
  return {
    input: u,
    output: current,
    title: "Suppression des commentaires, scripts et styles HTML et nettoyage d'expressions",
    transformations: applied,
  };
}
|
|
601
|
+
/**
 * Returns a transformer chaining, in this order: named entity decoding (`D`),
 * numeric entity decoding (`G`), pattern clean-up (`X`), Unicode character
 * simplification (`j`), HTML-to-text conversion (`k`) and text simplification
 * (`Y`), under the title "Simplification du HTML".
 *
 * @param options.removeAWithHref Forwarded unchanged to `k`.
 */
function K({
  removeAWithHref: removeLinks
} = {}) {
  return (n) => {
    const pipeline = [
      D,
      G,
      X,
      j,
      k({ removeAWithHref: removeLinks }),
      Y,
    ];
    return H("Simplification du HTML", pipeline)(n);
  };
}
|
|
613
|
+
function Y(u) {
|
|
614
|
+
const n = [];
|
|
615
|
+
let t = u;
|
|
616
|
+
for (const [s, e, i] of [
|
|
617
|
+
["Remplacement des espaces multiples par une espace unique", / +/g, " "],
|
|
618
|
+
["Suppression d'une espace en début de ligne", /^ /gm, ""],
|
|
619
|
+
["Suppression d'une espace en fin de ligne", / $/gm, ""],
|
|
620
|
+
[
|
|
621
|
+
"Remplacement des sauts de lignes multiples par un saut de ligne unique",
|
|
622
|
+
/\n\n+/g,
|
|
623
|
+
`
|
|
624
|
+
`
|
|
625
|
+
],
|
|
626
|
+
["Suppression d'un saut de ligne en début de texte", /^\n/g, ""],
|
|
627
|
+
["Suppression d'un saut de ligne en fin de texte", /\n$/g, ""]
|
|
628
|
+
]) {
|
|
629
|
+
let r = 0;
|
|
630
|
+
const c = [], o = t.replaceAll(e, (p, ...l) => {
|
|
631
|
+
const d = l.at(-2);
|
|
632
|
+
let g = i;
|
|
633
|
+
for (const [x, T] of l.slice(0, -2).entries())
|
|
634
|
+
g = g.replaceAll(`$${x + 1}`, T);
|
|
635
|
+
return c.push({
|
|
636
|
+
inputIndex: d,
|
|
637
|
+
inputLength: p.length,
|
|
638
|
+
outputIndex: d + r,
|
|
639
|
+
outputLength: g.length
|
|
640
|
+
}), r += g.length - p.length, g;
|
|
641
|
+
});
|
|
642
|
+
c.length !== 0 && (n.push({
|
|
643
|
+
input: t,
|
|
644
|
+
output: o,
|
|
645
|
+
sourceMap: c,
|
|
646
|
+
title: s
|
|
647
|
+
}), t = o);
|
|
648
|
+
}
|
|
649
|
+
return {
|
|
650
|
+
input: u,
|
|
651
|
+
output: t,
|
|
652
|
+
title: "Simplification du texte",
|
|
653
|
+
transformations: n
|
|
654
|
+
};
|
|
655
|
+
}
|
|
656
|
+
function j(u) {
|
|
657
|
+
const n = [];
|
|
658
|
+
let t = u;
|
|
659
|
+
for (const [e, i] of [
|
|
660
|
+
// Replace U+00A0 (no-break space) and tab with a normal space.
|
|
661
|
+
[/[ \t]/g, " "],
|
|
662
|
+
// Replace three non-ASCII dashes (U+2010, U+2011 et U+2013) with a minus sign.
|
|
663
|
+
[/[‐‑–]/g, "-"],
|
|
664
|
+
// Replace non-ASCII apostrophe.
|
|
665
|
+
[/’/g, "'"],
|
|
666
|
+
// Replace İ (I with a point) with normal I.
|
|
667
|
+
// The İ can be used, probably to differentiate the letter I from the Roman numeral I.
|
|
668
|
+
// For example: Article 199 decies İ of the General Tax Code.
|
|
669
|
+
// But Légifrance uses a classic I…
|
|
670
|
+
["İ", "I"]
|
|
671
|
+
])
|
|
672
|
+
t = t.replaceAll(e, (r, ...c) => {
|
|
673
|
+
const o = c.at(-2), p = {
|
|
674
|
+
inputIndex: o,
|
|
675
|
+
inputLength: 1,
|
|
676
|
+
outputIndex: o,
|
|
677
|
+
// Note: `outputIndex` is added below.
|
|
678
|
+
outputLength: 1
|
|
679
|
+
}, l = n.findIndex(
|
|
680
|
+
(d) => d.inputIndex > o
|
|
681
|
+
);
|
|
682
|
+
return l === -1 ? n.push(p) : n.splice(l, 0, p), i;
|
|
683
|
+
});
|
|
684
|
+
let s = 0;
|
|
685
|
+
for (const e of n)
|
|
686
|
+
e.outputIndex = e.inputIndex + s, s += e.outputLength - e.inputLength;
|
|
687
|
+
return {
|
|
688
|
+
input: u,
|
|
689
|
+
output: t,
|
|
690
|
+
sourceMap: n,
|
|
691
|
+
title: "Simplification des caractères unicodes"
|
|
692
|
+
};
|
|
693
|
+
}
|
|
694
|
+
export {
|
|
695
|
+
H as chainTransformers,
|
|
696
|
+
k as convertHtmlElementsToText,
|
|
697
|
+
D as decodeNamedHtmlEntities,
|
|
698
|
+
G as decodeNumericHtmlEntities,
|
|
699
|
+
U as iterOriginalMergedPositionsFromTransformed,
|
|
700
|
+
q as originalMergedPositionsFromTransformed,
|
|
701
|
+
W as originalSplitPositionsFromTransformed,
|
|
702
|
+
w as replacePattern,
|
|
703
|
+
X as replacePatterns,
|
|
704
|
+
K as simplifyHtml,
|
|
705
|
+
Y as simplifyText,
|
|
706
|
+
j as simplifyUnicodeCharacters
|
|
707
|
+
};
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/**
 * Declared never to return (return type `never`). The `value` parameter is
 * typed `never`, so a call only type-checks where the compiler has narrowed
 * the argument to `never`.
 *
 * @param type Label given as a string.
 * @param value Value that is expected to be impossible.
 */
export declare function assertNever(type: string, value: never): never;
|
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
export type { TextPosition } from './text_parsers/positions.js';
|
|
2
|
+
export { convertHtmlElementsToText, decodeNamedHtmlEntities, decodeNumericHtmlEntities, replacePattern, replacePatterns, simplifyHtml, simplifyText, simplifyUnicodeCharacters, } from './text_parsers/simplifiers.js';
|
|
3
|
+
export { chainTransformers, iterOriginalMergedPositionsFromTransformed, originalMergedPositionsFromTransformed, originalSplitPositionsFromTransformed, type FragmentReverseTransformation, type SourceMapSegment, type Transformation, type TransformationLeaf, type TransformationNode, type Transformer, type TransformerLeaf, type TransformerNode, } from './text_parsers/transformers.js';
|