@tricoteuses/senat 2.20.17 → 2.20.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +58 -19
- package/lib/git.d.ts +26 -0
- package/lib/git.js +167 -0
- package/lib/index.d.ts +1 -1
- package/lib/loaders.d.ts +3 -2
- package/lib/model/commission.d.ts +2 -2
- package/lib/model/commission.js +5 -4
- package/lib/model/seance.d.ts +2 -8
- package/lib/model/seance.js +28 -113
- package/lib/model/util.d.ts +0 -4
- package/lib/model/util.js +0 -38
- package/lib/scripts/convert_data.js +25 -1
- package/lib/scripts/retrieve_agenda.js +7 -18
- package/lib/scripts/retrieve_cr_commission.js +1 -10
- package/lib/scripts/retrieve_cr_seance.d.ts +1 -1
- package/lib/scripts/retrieve_cr_seance.js +183 -127
- package/lib/scripts/retrieve_videos.d.ts +1 -1
- package/lib/scripts/retrieve_videos.js +46 -92
- package/lib/scripts/shared/cli_helpers.d.ts +25 -3
- package/lib/scripts/shared/cli_helpers.js +28 -0
- package/lib/types/agenda.d.ts +5 -6
- package/lib/utils/cr_spliting.d.ts +2 -10
- package/lib/utils/cr_spliting.js +2 -119
- package/lib/utils/date.d.ts +10 -0
- package/lib/utils/date.js +100 -0
- package/lib/utils/reunion_odj_building.d.ts +2 -2
- package/lib/utils/reunion_odj_building.js +8 -12
- package/lib/utils/reunion_parsing.d.ts +23 -0
- package/lib/utils/reunion_parsing.js +209 -0
- package/lib/utils/scoring.d.ts +14 -0
- package/lib/utils/scoring.js +147 -0
- package/lib/utils/string_cleaning.d.ts +7 -0
- package/lib/utils/string_cleaning.js +57 -0
- package/package.json +1 -1
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { DateTime } from "luxon";
|
|
2
|
+
import { AgendaEvent, Reunion } from "../types/agenda";
|
|
3
|
+
import { DossierLegislatifResult } from "../model/dosleg";
|
|
4
|
+
import * as cheerio from "cheerio";
|
|
5
|
+
/** Codes for the kinds of agenda entry this module distinguishes. */
type KnownType = "SP" | "COM" | "MC" | "OD" | "ID";
|
|
6
|
+
/** Dossier records indexed by a string key (a Sénat URL, judging by the alias name — confirm against the producer of this map). */
type DossierBySenatUrl = Record<string, DossierLegislatifResult>;
|
|
7
|
+
/** Names of the buckets that reunions are grouped into. */
type ReunionBucket = "IDS" | "IDC" | "IDM" | "IDO" | "IDI";
|
|
8
|
+
/** One list of reunions per bucket; every bucket key is always present. */
type ReunionsByBucket = Record<ReunionBucket, Reunion[]>;
|
|
9
|
+
/**
 * Converts agenda events into Reunion records grouped by bucket.
 * Events whose type cannot be classified are left out.
 * @param events agenda events to classify
 * @param dossierBySenatUrl dossier lookup forwarded to the agenda (odj) builder
 * @returns one sorted Reunion array per bucket
 */
export declare function buildReunionsByBucket(events: AgendaEvent[], dossierBySenatUrl: DossierBySenatUrl): ReunionsByBucket;
|
|
10
|
+
/**
 * Builds a reunion identifier: "RUSN" + date digits + bucket suffix + organe initials + agenda event id.
 * @param dateISO dash-separated date; "00000000" is used when it is empty
 * @param kind meeting kind, mapped to its bucket suffix
 * @param agendaEventId id of the source agenda event, appended as-is
 * @param organe body name; its initials are included except for the "IDS" bucket
 */
export declare function makeReunionUid(dateISO: string, kind: KnownType, agendaEventId: string, organe?: string | null): string;
|
|
11
|
+
/**
 * Concatenates the first three dash-separated parts of a date string,
 * e.g. "2024-03-05" becomes "20240305".
 */
export declare function formatYYYYMMDD(dateYYYYMMDD: string): string;
|
|
12
|
+
/**
 * Picks start and end times for an agenda event.
 * The event's own startTime/endTime are preferred; otherwise times parsed
 * from its timeOriginal text are used; null when neither is available.
 */
export declare function deriveTimesForEvent(ev: AgendaEvent): {
    startISO: string | null;
    endISO: string | null;
};
|
|
16
|
+
/** One linked entry extracted from a table of contents ("sommaire"). */
export type SommaireBlock = {
    /** Whitespace-normalised text of the link. */
    text: string;
    /** Position found for the link (or its enclosing paragraph) in the caller's index map. */
    startIndex: number;
    /** The link's href without its leading "#", when present. */
    targetId?: string | null;
};
|
|
21
|
+
/**
 * Collects the in-page links (href starting with "#") found under tm3/tm5 elements.
 * Links with empty text, or with no entry in `idx`, are skipped.
 * @param $ loaded cheerio document
 * @param idx map from DOM node to its position index
 */
export declare function extractSommaireBlocks($: cheerio.CheerioAPI, idx: Map<any, number>): SommaireBlock[];
|
|
22
|
+
/**
 * Parses an ISO string into a luxon DateTime.
 * @returns the DateTime, or null when the input is empty or not a valid ISO value
 */
export declare function parseISO(iso: string | null | undefined): DateTime | null;
|
|
23
|
+
export {};
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
import { DateTime } from "luxon";
|
|
2
|
+
import { buildOdj } from "./reunion_odj_building";
|
|
3
|
+
import { norm } from "./string_cleaning";
|
|
4
|
+
// IANA time zone used when this module builds or parses date-times.
const PARIS = "Europe/Paris";
// French articles and prepositions skipped when abbreviating a body name.
const STOPWORDS = new Set("de du des la le les l d et en au aux pour sur sous à a".split(" "));
|
|
25
|
+
/**
 * Wraps a single agenda event into a Reunion record for the Sénat ("SN").
 * @param e agenda event to convert
 * @param dossierBySenatUrl lookup passed unchanged to buildOdj
 * @param uid identifier stored on the record
 * @returns the Reunion-shaped object
 */
function toReunion(e, dossierBySenatUrl, uid) {
  // Whitespace-normalised date, with the raw value as a fallback.
  const cleanedDate = norm(e.date) ?? e.date;
  const times = deriveTimesForEvent(e);
  return {
    uid,
    chambre: "SN",
    date: cleanedDate,
    type: e.type || "",
    organe: e.organe || undefined,
    // Derived times first, then the event's own fields, then null.
    startTime: times.startISO ?? e.startTime ?? null,
    endTime: times.endISO ?? e.endTime ?? null,
    // Only a strict `true` counts as "video capture available".
    captationVideo: e.captationVideo === true,
    titre: e.titre,
    objet: e.objet || "",
    events: [e], // TODO remove
    odj: buildOdj([e], dossierBySenatUrl),
  };
}
|
|
45
|
+
/**
 * Groups agenda events into one Reunion list per bucket and sorts each list.
 *
 * Events whose type label cannot be classified are reported with a warning
 * and left out of the result.
 *
 * @param events agenda events to classify; null/empty yields empty buckets
 * @param dossierBySenatUrl dossier lookup forwarded to toReunion
 * @returns an object with the keys IDS, IDC, IDM, IDO and IDI
 */
export function buildReunionsByBucket(events, dossierBySenatUrl) {
  const out = { IDS: [], IDC: [], IDM: [], IDO: [], IDI: [] };
  if (!events?.length)
    return out;
  for (const e of events) {
    const kind = classifyAgendaType(e?.type);
    if (!kind) {
      // Report the event before skipping it, so the warning is actually emitted.
      console.warn("Can't determine type of reunion");
      continue;
    }
    const bucket = typeToSuffixStrict(kind);
    const uid = makeReunionUid(e.date, kind, e.id, e.organe ?? null);
    out[bucket].push(toReunion(e, dossierBySenatUrl, uid));
  }
  // Entries without a start time are treated as the very end of their day.
  const sortKey = (r) => DateTime.fromISO(`${r.date}T${r.startTime || "23:59:59.999+02:00"}`, { zone: PARIS }).toMillis();
  // Per-bucket sort by date + time, then by organe, then by titre.
  for (const k of Object.keys(out)) {
    out[k].sort((a, b) => {
      const da = sortKey(a);
      const db = sortKey(b);
      return da - db || (a.organe || "").localeCompare(b.organe || "") || (a.titre || "").localeCompare(b.titre || "");
    });
  }
  return out;
}
|
|
69
|
+
/**
 * Trims a string and removes its combining accent marks.
 * The text is decomposed with NFKD first, so "é" becomes "e".
 * @param s input text; null/undefined/empty is treated as ""
 */
function normalizeNoAccents(s) {
  return (s || "")
    .trim()
    .normalize("NFKD")
    .replace(/[\u0300-\u036f]/g, "");
}
/**
 * Maps an agenda type label to a meeting-kind code.
 *
 * The label is lower-cased and stripped of accents, then tested against the
 * patterns below in order; the first match wins.
 *
 * @param typeLabel raw type label of the agenda event
 * @returns "SP", "COM", "MC", "OD" or "ID", or null when nothing matches
 */
function classifyAgendaType(typeLabel) {
  const s = normalizeNoAccents(typeLabel || "").toLowerCase();
  if (/\bseance\b.*\bpublique\b/.test(s))
    return "SP";
  if (/\bcommissions\b/.test(s))
    return "COM";
  if (/\bmission\b.*\bcontrole\b/.test(s))
    return "MC";
  if (/\boffices\b|\bdelegations\b/.test(s))
    return "OD";
  // Word-boundary anchors on both alternatives; "decisionnelles" is accepted
  // with one or two "n" so the correct French spelling matches too.
  if (/\binstances\b|\bdecisionn?elles\b/.test(s))
    return "ID";
  return null;
}
|
|
89
|
+
/**
 * Translates a meeting-kind code into its bucket name.
 * @param kind one of "SP", "COM", "MC", "OD", "ID"
 * @returns the matching bucket name, or undefined for any other value
 */
function typeToSuffixStrict(kind) {
  const suffixByKind = new Map([
    ["SP", "IDS"],
    ["COM", "IDC"],
    ["MC", "IDM"],
    ["OD", "IDO"],
    ["ID", "IDI"],
  ]);
  return suffixByKind.get(kind);
}
|
|
103
|
+
/**
 * Abbreviates a body name by joining the upper-cased first two characters of
 * each significant word.
 * @param input body name; empty input yields ""
 * @param maxLen maximum length of the abbreviation (default 8)
 * @returns the abbreviation, truncated to maxLen characters
 */
function organeInitials(input, maxLen = 8) {
  if (!input)
    return "";
  // Accent-free, apostrophes and punctuation turned into single spaces.
  const ascii = normalizeNoAccents(input)
    .replace(/['’]/g, " ")
    .replace(/[^A-Za-z0-9\s]/g, " ")
    .replace(/\s+/g, " ")
    .trim();
  if (!ascii)
    return "";
  return ascii
    .split(" ")
    // Stop words are compared in lower case.
    .filter((word) => word && !STOPWORDS.has(word.toLowerCase()))
    .map((word) => word.slice(0, 2))
    .filter((prefix) => /[A-Za-z0-9]/.test(prefix))
    .map((prefix) => prefix.toUpperCase())
    .join("")
    .slice(0, maxLen);
}
|
|
129
|
+
/**
 * Builds a reunion identifier from its parts.
 * Layout: "RUSN" + date digits + bucket suffix + organe initials + agenda event id.
 * @param dateISO dash-separated date; "00000000" is used when it is empty
 * @param kind meeting-kind code, mapped to its bucket suffix
 * @param agendaEventId id of the source agenda event, appended as-is
 * @param organe body name; ignored for the "IDS" bucket
 */
export function makeReunionUid(dateISO, kind, agendaEventId, organe) {
  const datePart = dateISO ? formatYYYYMMDD(dateISO) : "00000000";
  const bucketSuffix = typeToSuffixStrict(kind);
  const wantsOrgane = organe && bucketSuffix !== "IDS";
  const organePart = wantsOrgane ? organeInitials(organe) : "";
  return `RUSN${datePart}${bucketSuffix}${organePart}${agendaEventId}`;
}
|
|
136
|
+
/**
 * Concatenates the first three dash-separated parts of a date string,
 * e.g. "2024-03-05" becomes "20240305".
 * @param dateYYYYMMDD date written as year-month-day with dashes
 */
export function formatYYYYMMDD(dateYYYYMMDD) {
  const pieces = dateYYYYMMDD.split("-");
  const year = pieces[0];
  const month = pieces[1];
  const day = pieces[2];
  return "" + year + month + day;
}
|
|
140
|
+
/**
 * Extracts clock times from French schedule text such as "à 10 h 30" or
 * "de 10 h à 12 h".
 *
 * Three forms are tried in order: a "de … à …" range, a single "à …" time,
 * then a bare time with no preposition. Hours and minutes are clamped to
 * valid ranges.
 *
 * @param timeOriginal free-text schedule; empty input yields nulls
 * @returns `{ start, end }`, each a time string from toIsoTime or null
 */
function parseTimeOriginalFR(timeOriginal) {
  if (!timeOriginal)
    return { start: null, end: null };
  const txt = timeOriginal
    .normalize("NFC") // compose "a" + combining grave into a single "à"
    .replace(/\u00A0/g, " ") // nbsp → space
    .replace(/\s+/g, " ") // collapse whitespace runs
    .toLowerCase()
    .trim();
  // 1) "de 10 h 30 à 12 heures", "de 10h30 à 12h", "de 9 h à 11 h 15", etc.
  const reRange = /\bde\s+(\d{1,2})\s*(?:h|:)?\s*(\d{1,2})?\s*(?:heures?)?\s*à\s*(\d{1,2})\s*(?:h|:)?\s*(\d{1,2})?\s*(?:heures?)?/i;
  const mRange = txt.match(reRange);
  if (mRange) {
    const h1 = clampHour(+mRange[1]), m1 = clampMinute(mRange[2] ? +mRange[2] : 0);
    const h2 = clampHour(+mRange[3]), m2 = clampMinute(mRange[4] ? +mRange[4] : 0);
    return { start: toIsoTime(h1, m1), end: toIsoTime(h2, m2) };
  }
  // 2) "à 10 h 30", "à 10h", "a 10h30", "a 9 heures", etc.
  // `\b` is ASCII-only, so it never holds between a space (or the string
  // start) and "à"; a lookbehind expresses "not preceded by a letter or
  // digit" for both spellings.
  const reAt = /(?<![a-z0-9\u00e0-\u00ff])(?:a|à)\s*(\d{1,2})\s*(?:h|:)?\s*(\d{1,2})?\s*(?:heures?)?/i;
  const mAt = txt.match(reAt);
  if (mAt) {
    const h = clampHour(+mAt[1]), m = clampMinute(mAt[2] ? +mAt[2] : 0);
    return { start: toIsoTime(h, m), end: null };
  }
  // 3) "10 h 30", "15h", "9 heures" without "à" / "de ... à ...".
  const reBare = /\b(\d{1,2})\s*(?:h|:)?\s*(\d{1,2})?\s*(?:heures?)?\b/;
  const mBare = txt.match(reBare);
  if (mBare) {
    const h = clampHour(+mBare[1]), m = clampMinute(mBare[2] ? +mBare[2] : 0);
    return { start: toIsoTime(h, m), end: null };
  }
  return { start: null, end: null };
}
/** Limits an hour value to the range 0–23. */
function clampHour(h) {
  return Math.max(0, Math.min(23, h));
}
/** Limits a minute value to the range 0–59. */
function clampMinute(m) {
  return Math.max(0, Math.min(59, m));
}
/**
 * Formats hours and minutes as "HH:MM:00.000+02:00".
 * NOTE(review): the UTC offset is hard-coded to +02:00; confirm that callers
 * do not need a date-dependent offset.
 */
function toIsoTime(h, m) {
  return `${String(h).padStart(2, "0")}:${String(m).padStart(2, "0")}:00.000+02:00`;
}
|
|
182
|
+
/**
 * Picks start and end times for an agenda event.
 * The event's own startTime/endTime are preferred; otherwise the values
 * parsed from its timeOriginal text are used; null when neither exists.
 * @param ev agenda event
 * @returns `{ startISO, endISO }`
 */
export function deriveTimesForEvent(ev) {
  const parsed = parseTimeOriginalFR(ev.timeOriginal);
  return {
    startISO: ev.startTime ?? parsed.start ?? null,
    endISO: ev.endTime ?? parsed.end ?? null,
  };
}
|
|
190
|
+
/**
 * Collects the linked lines of a table of contents.
 * @param $ loaded cheerio document
 * @param idx map from DOM node to its position index
 * @returns one `{ text, startIndex, targetId }` entry per usable link
 */
export function extractSommaireBlocks($, idx) {
  // Table-of-contents lines that carry an in-page link.
  const selector = "cri\\:tm5 a[href^='#'], cri\\:tm3 a[href^='#'], p.tm5 a[href^='#'], p.tm3 a[href^='#']";
  const blocks = [];
  $(selector).each((_, anchor) => {
    const link = $(anchor);
    const href = link.attr("href") || "";
    const text = norm(link.text() || "");
    // Prefer the anchor's own index, then that of its enclosing paragraph.
    const startIndex = idx.get(anchor) ?? idx.get(link.closest("p")[0]) ?? null;
    if (text && startIndex != null) {
      blocks.push({ text, startIndex, targetId: href.startsWith("#") ? href.slice(1) : null });
    }
  });
  return blocks;
}
|
|
204
|
+
/**
 * Parses an ISO string into a luxon DateTime.
 * @param iso ISO date-time string; empty input yields null
 * @returns the parsed DateTime, or null when parsing fails
 */
export function parseISO(iso) {
  const parsed = iso ? DateTime.fromISO(iso, { setZone: true, zone: PARIS }) : null;
  return parsed && parsed.isValid ? parsed : null;
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import { AgendaEvent, Reunion } from "../types/agenda";
|
|
2
|
+
/** Jaccard index |A∩B| / |A∪B| of two sets; 0 when either set is empty. */
export declare function jaccard(a: Set<string>, b: Set<string>): number;
|
|
3
|
+
/** Jaccard index of the distinct space-separated tokens of two strings after text normalisation; 0 when either has no tokens. */
export declare function jaccardTokenSim(a: string, b: string): number;
|
|
4
|
+
/** Tells whether a block of text is too short or too generic (article numbers, procedural headings, role titles) to be matched. */
export declare function isNoiseBlock(text: string): boolean;
|
|
5
|
+
/**
 * Robust score for matching a short block against a long event:
 * - coverageBloc = |A∩B| / |A| (main criterion)
 * - Jaccard as a secondary signal (useful when both texts are long)
 * - optional time-proximity bonus
 *
 * @returns a score clamped to the range 0–1
 */
export declare function scoreSommaireBlockForEvent(blockText: string, ev: AgendaEvent): number;
|
|
12
|
+
/**
 * Scores how well a video matches an agenda reunion.
 * Weights: 0.3 title + 0.3 time + 0.4 organe, or 0.6 title + 0.4 organe when
 * `timeAmbigious` is true. The organe part is the full 0.4 when `sameOrg` is true.
 */
export declare function scoreVideo(agenda: Reunion, agendaTs: number | null, sameOrg: boolean, videoTitle?: string, videoEpoch?: number, videoOrganes?: string[], timeAmbigious?: boolean): number;
|
|
13
|
+
/** Dice coefficient 2·|A∩B| / (|A| + |B|) over the distinct normalised tokens of two strings; 0 when either has no tokens. */
export declare function dice(a: string, b: string): number;
|
|
14
|
+
/** Lower-cases, strips accents, replaces everything except letters, digits, whitespace and hyphens with spaces, then collapses whitespace. */
export declare function normalize(s?: string | null): string;
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
import { parseISO } from "./reunion_parsing";
|
|
2
|
+
import { normalizeText } from "./string_cleaning";
|
|
3
|
+
/**
 * Jaccard index of two sets: shared members divided by the size of the union.
 * @param a first set
 * @param b second set
 * @returns a value in 0–1; 0 when either set is empty
 */
export function jaccard(a, b) {
  if (!(a.size && b.size))
    return 0;
  const shared = [...a].filter((item) => b.has(item)).length;
  return shared / (a.size + b.size - shared);
}
|
|
12
|
+
/**
 * Jaccard index of the distinct tokens of two strings.
 * Both strings go through normalizeText and are split on spaces.
 * @returns a value in 0–1; 0 when either string has no tokens
 */
export function jaccardTokenSim(a, b) {
  const [left, right] = [a, b].map((text) => new Set(normalizeText(text).split(" ").filter(Boolean)));
  if (!left.size || !right.size)
    return 0;
  let shared = 0;
  left.forEach((tok) => {
    if (right.has(tok))
      shared += 1;
  });
  return shared / (left.size + right.size - shared);
}
|
|
23
|
+
/**
 * Tells whether a block of text is too short or too generic to be matched.
 * @param text raw block text
 * @returns true for empty text, text under 6 normalised characters, or text
 *          matching one of the generic patterns below
 */
export function isNoiseBlock(text) {
  const t = normalizeText(text);
  if (!t || t.length < 6)
    return true;
  // Very generic, institutional or procedural wording,
  // e.g. "article 78", "ordre du jour", "organisation des travaux".
  const noisePatterns = [
    /^article\s+\d+/,
    /\b(organisation des travaux|ordre du jour|suspension|reprise de la seance)\b/,
    /\b(vice presidente|president|secretaire|ministre|rapporteur)\b/,
  ];
  return noisePatterns.some((re) => re.test(t));
}
|
|
39
|
+
/**
 * Robust score for matching a short block against a long event:
 * - coverage = |A∩B| / |A| (main criterion)
 * - Jaccard as a secondary signal (useful when both texts are long)
 * - small bonuses for literal inclusion and for time proximity
 *
 * @param blockText text of the table-of-contents block
 * @param ev agenda event; its titre, objet and startTime are read
 * @returns a score clamped to the range 0–1
 */
export function scoreSommaireBlockForEvent(blockText, ev) {
  const evText = `${ev.titre ?? ""} ${ev.objet ?? ""}`;
  const blockTokens = tokens(blockText);
  const eventTokens = tokens(evText);
  if (!blockTokens.size || !eventTokens.size)
    return 0;
  // Share of the block's tokens that also appear in the event.
  const coverage = intersectionSize(blockTokens, eventTokens) / blockTokens.size;
  const jac = jaccard(blockTokens, eventTokens);
  // Bonus when the whole normalised block appears verbatim in the event text.
  const normalizedBlock = normalizeText(blockText);
  const inclusionBonus = normalizedBlock.length >= 10 && normalizeText(evText).includes(normalizedBlock) ? 0.12 : 0;
  // Bonus when the block mentions a time close to the event's start.
  const timeBonus = timeProximityBonus(ev.startTime ?? null, blockText);
  // Coverage dominates the combined score.
  const raw = Math.max(coverage, jac) * 0.85 + Math.min(1, jac) * 0.1 + inclusionBonus + timeBonus;
  return Math.min(1, Math.max(0, raw));
}
|
|
64
|
+
/** Counts the members of `a` that are also in `b`. */
function intersectionSize(a, b) {
  return [...a].reduce((count, member) => (b.has(member) ? count + 1 : count), 0);
}
|
|
71
|
+
/**
 * Splits normalised text into a set of significant tokens.
 * Words shorter than three characters and listed stop words are dropped.
 */
function tokens(s) {
  const stop = new Set(["de", "du", "des", "la", "le", "les", "et", "au", "aux", "sur", "en", "d", "l", "un", "une"]);
  const kept = [];
  for (const word of normalizeText(s).split(" ")) {
    if (word.length >= 3 && !stop.has(word))
      kept.push(word);
  }
  return new Set(kept);
}
|
|
77
|
+
/**
 * Small score bonus when the block text mentions a time close to the event start.
 * @param eventStartISO event start as an ISO string, or null
 * @param blockText text searched for "HH h MM" mentions
 * @returns 0.1 within 30 minutes, 0.06 within 60, 0.03 within 120, else 0
 */
function timeProximityBonus(eventStartISO, blockText) {
  const dt = eventStartISO ? parseISO(eventStartISO) : null;
  if (!dt)
    return 0;
  const eventMin = dt.hour * 60 + dt.minute;
  const hints = extractHourHints(blockText);
  if (!hints.length)
    return 0;
  // Smallest gap, in minutes, between the event start and any mentioned time.
  const best = Math.min(...hints.map((hint) => Math.abs(hint.h * 60 + hint.m - eventMin)));
  const tiers = [
    [30, 0.1],
    [60, 0.06],
    [120, 0.03],
  ];
  for (const [limit, bonus] of tiers) {
    if (best <= limit)
      return bonus;
  }
  return 0;
}
|
|
98
|
+
/**
 * Finds every "HH h MM" mention in a text (case-insensitive, spaces optional).
 * @param text text to scan; null/undefined is treated as ""
 * @returns one `{ h, m }` pair per mention, in order of appearance
 */
function extractHourHints(text) {
  const lowered = (text || "").toLowerCase();
  const mentions = lowered.matchAll(/\b(\d{1,2})\s*h\s*(\d{2})\b/g);
  return Array.from(mentions, (match) => ({ h: Number(match[1]), m: Number(match[2]) }));
}
|
|
107
|
+
/**
 * Scores how well a video matches an agenda reunion.
 *
 * Weights: 0.3 title + 0.3 time + 0.4 organe, or 0.6 title + 0.4 organe when
 * `timeAmbigious` is true.
 *
 * @param agenda reunion; its objet, titre and organe are read
 * @param agendaTs reunion timestamp; presumably epoch seconds, since the
 *                 difference is divided by 60 to get minutes — confirm
 * @param sameOrg when true the organe part scores the full 0.4
 * @param videoTitle title compared against the reunion's objet and titre
 * @param videoEpoch video timestamp, in the same unit as agendaTs
 * @param videoOrganes organe names attached to the video
 * @param timeAmbigious when true the time component is ignored
 */
export function scoreVideo(agenda, agendaTs, sameOrg, videoTitle, videoEpoch, videoOrganes, timeAmbigious = false) {
  const title = videoTitle || "";
  // Best of the two text similarities.
  const titleScore = Math.max(dice(agenda.objet || "", title), dice(agenda.titre || "", title));
  // Best similarity between the reunion's organe and any organe of the video.
  const hasOrganes = agenda.organe && videoOrganes && videoOrganes.length;
  const orgScore = hasOrganes ? Math.max(...videoOrganes.map((v) => dice(agenda.organe, v))) : 0;
  const orgPart = sameOrg ? 0.4 : orgScore * 0.4;
  if (timeAmbigious) {
    return 0.6 * titleScore + orgPart;
  }
  let timeScore = 0;
  if (agendaTs && videoEpoch) {
    // Exponential decay on the gap in minutes.
    const deltaMin = Math.abs(videoEpoch - agendaTs) / 60;
    timeScore = Math.exp(-deltaMin / 60);
  }
  return 0.3 * titleScore + 0.3 * timeScore + orgPart;
}
|
|
126
|
+
/** Splits a normalised string into its non-empty space-separated tokens. */
function tokensDice(s) {
  const words = normalize(s).split(" ");
  return words.filter((word) => Boolean(word));
}
|
|
129
|
+
/**
 * Dice coefficient over the distinct tokens of two strings:
 * 2·|A∩B| / (|A| + |B|).
 * @returns a value in 0–1; 0 when either string has no tokens
 */
export function dice(a, b) {
  const left = new Set(tokensDice(a));
  const right = new Set(tokensDice(b));
  if (!(left.size && right.size))
    return 0;
  const shared = [...left].filter((tok) => right.has(tok)).length;
  return (2 * shared) / (left.size + right.size);
}
|
|
139
|
+
/**
 * Normalises text for token comparison: lower case, no accents, everything
 * except letters, digits, whitespace and hyphens replaced by a space,
 * whitespace collapsed and trimmed.
 * @param s input text; null/undefined is treated as ""
 */
export function normalize(s) {
  const lowered = (s ?? "").toLowerCase();
  const unaccented = lowered.normalize("NFD").replace(/[\u0300-\u036f]/g, "");
  const wordsOnly = unaccented.replace(/[^\p{L}\p{N}\s-]/gu, " ");
  return wordsOnly.replace(/\s+/g, " ").trim();
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
/** Lower-cases, strips diacritics, replaces everything except a–z, 0–9 and whitespace with spaces, then collapses whitespace. */
export declare function normalizeText(t: string): string;
|
|
2
|
+
/** Decodes HTML character references in a string; null, undefined and empty input yield "". */
export declare function decodeHtmlEntities(s?: string | null): string;
|
|
3
|
+
/** Removes one trailing `:`, `,`, `.` or `;` (with surrounding whitespace) and trims the result. */
export declare function stripTrailingPunct(s: string): string;
|
|
4
|
+
/** Cleans a speaker label: normalises whitespace, drops trailing punctuation and collapses an immediately repeated name ("X. X", "X, X", "X X") to one occurrence. */
export declare function dedupeSpeaker(raw: string): string;
|
|
5
|
+
/** Removes stray whitespace around typographic apostrophes and before `, ; : . ! ?`. */
export declare function fixApostrophes(s: string): string;
|
|
6
|
+
/** Replaces non-breaking spaces, collapses whitespace runs to single spaces and trims; null/undefined become "". */
export declare function norm(s?: string | null): string;
|
|
7
|
+
/** Replaces non-breaking spaces, collapses whitespace runs to single spaces and trims. */
export declare function normalizeSpaces(s: string): string;
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
 * Normalises text for matching: lower case, no diacritics, everything except
 * a–z, 0–9 and whitespace replaced by a space, whitespace collapsed and trimmed.
 * @param t input text; null/undefined/empty is treated as ""
 */
export function normalizeText(t) {
  const lowered = (t || "").toLowerCase();
  const unaccented = lowered.normalize("NFD").replace(/\p{Diacritic}/gu, "");
  const alnumOnly = unaccented.replace(/[^a-z0-9\s]/g, " ");
  return alnumOnly.replace(/\s+/g, " ").trim();
}
|
|
10
|
+
/**
 * Decodes HTML character references in a string.
 *
 * Handles hexadecimal (`&#x…;`) and decimal (`&#…;`) references plus the
 * named entities amp, lt, gt, quot and apos. Everything is decoded in a
 * single pass, so text produced by one replacement is never decoded again
 * (`&amp;lt;` becomes `&lt;`, not `<`).
 *
 * @param s text to decode; null, undefined and empty input yield ""
 * @returns the decoded text; references to code points above U+10FFFF are
 *          left untouched instead of throwing
 */
export function decodeHtmlEntities(s) {
  if (!s)
    return "";
  const named = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
  return s.replace(/&(#x[0-9a-fA-F]+|#\d+|amp|lt|gt|quot|apos);/g, (match, body) => {
    if (body[0] !== "#")
      return named[body];
    const isHex = body[1] === "x";
    const code = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    // String.fromCodePoint throws a RangeError outside the Unicode range.
    return code <= 0x10ffff ? String.fromCodePoint(code) : match;
  });
}
|
|
22
|
+
/**
 * Removes one trailing `:`, `,`, `.` or `;` together with the whitespace
 * around it, then trims the result.
 */
export function stripTrailingPunct(s) {
  const trailingPunct = /\s*([:,.;])\s*$/u;
  const stripped = s.replace(trailingPunct, "");
  return stripped.trim();
}
|
|
25
|
+
/**
 * Cleans a speaker label and collapses an immediately repeated name.
 * Recognised repetitions: "X. X", "X, X" (optional trailing comma) and "X X".
 * @param raw raw speaker text
 * @returns the cleaned label without a final period
 */
export function dedupeSpeaker(raw) {
  const cleaned = stripTrailingPunct(norm(raw));
  const dupPatterns = [/^(.+?)\s*[.]\s*\1$/u, /^(.+?)\s*,\s*\1,?$/u, /^(.+?)\s+\1$/u];
  // The first pattern that matches decides; later ones are not consulted.
  const firstMatch = dupPatterns.reduce((found, re) => found ?? cleaned.match(re), null);
  const name = firstMatch ? firstMatch[1] : cleaned;
  return name.replace(/\.\s*$/, "");
}
|
|
38
|
+
/**
 * Tidies spacing around typographic apostrophes and before punctuation.
 * The passes run in order: whitespace around "’", elided letter + "’",
 * "’" + following letter, then whitespace before `, ; : . ! ?`.
 */
export function fixApostrophes(s) {
  const passes = [
    [/\s*’\s*/g, "’"],
    [/\b([dljctmsn])\s*’/gi, (_, letter) => letter + "’"],
    [/’\s+([A-Za-zÀ-ÖØ-öø-ÿ])/g, "’$1"],
    [/\s+([,;:.!?])/g, "$1"],
  ];
  return passes.reduce((text, [pattern, replacement]) => text.replace(pattern, replacement), s);
}
|
|
46
|
+
/**
 * Normalises whitespace: non-breaking spaces become spaces, whitespace runs
 * collapse to one space, and the result is trimmed.
 * @param s input text; null/undefined/empty is treated as ""
 */
export function norm(s) {
  const text = s || "";
  const withoutNbsp = text.replace(/\u00A0/g, " ");
  return withoutNbsp.replace(/\s+/g, " ").trim();
}
|
|
52
|
+
/**
 * Normalises whitespace in a string: non-breaking spaces become spaces,
 * whitespace runs collapse to one space, and the result is trimmed.
 * @param s input string (must not be null or undefined)
 */
export function normalizeSpaces(s) {
  const spaced = s.replace(/\u00A0/g, " ");
  const collapsed = spaced.replace(/\s+/g, " ");
  return collapsed.trim();
}
|