@tricoteuses/senat 2.20.18 → 2.20.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/index.d.ts +1 -1
- package/lib/loaders.d.ts +2 -2
- package/lib/model/commission.d.ts +2 -2
- package/lib/model/commission.js +5 -4
- package/lib/model/seance.d.ts +2 -8
- package/lib/model/seance.js +28 -113
- package/lib/model/util.d.ts +0 -4
- package/lib/model/util.js +0 -38
- package/lib/scripts/retrieve_agenda.js +7 -18
- package/lib/scripts/retrieve_cr_commission.js +1 -10
- package/lib/scripts/retrieve_cr_seance.d.ts +1 -1
- package/lib/scripts/retrieve_cr_seance.js +183 -127
- package/lib/scripts/retrieve_videos.d.ts +1 -1
- package/lib/scripts/retrieve_videos.js +46 -92
- package/lib/types/agenda.d.ts +5 -6
- package/lib/utils/cr_spliting.d.ts +2 -10
- package/lib/utils/cr_spliting.js +2 -119
- package/lib/utils/date.d.ts +10 -0
- package/lib/utils/date.js +100 -0
- package/lib/utils/reunion_odj_building.d.ts +2 -2
- package/lib/utils/reunion_odj_building.js +8 -12
- package/lib/utils/reunion_parsing.d.ts +23 -0
- package/lib/utils/reunion_parsing.js +209 -0
- package/lib/utils/scoring.d.ts +14 -0
- package/lib/utils/scoring.js +147 -0
- package/lib/utils/string_cleaning.d.ts +7 -0
- package/lib/utils/string_cleaning.js +57 -0
- package/package.json +1 -1
package/lib/index.d.ts
CHANGED
|
@@ -3,7 +3,7 @@ export type { DossierLegislatifResult } from "./model/dosleg";
|
|
|
3
3
|
export type { ScrutinResult } from "./model/scrutins";
|
|
4
4
|
export type { QuestionResult } from "./model/questions";
|
|
5
5
|
export type { CirconscriptionResult, OrganismeResult, SenateurResult } from "./model/sens";
|
|
6
|
-
export type { AgendaEvent,
|
|
6
|
+
export type { AgendaEvent, Reunion, ReunionOdjPoint } from "./types/agenda";
|
|
7
7
|
export type { Ses, Sub, TxtAmeli } from "./types/ameli";
|
|
8
8
|
export type { CompteRendu } from "./types/compte_rendu";
|
|
9
9
|
export type { Debat, LecAssDeb } from "./types/debats";
|
package/lib/loaders.d.ts
CHANGED
|
@@ -4,7 +4,7 @@ import { DossierLegislatifResult } from "./model/dosleg";
|
|
|
4
4
|
import { QuestionResult } from "./model/questions";
|
|
5
5
|
import { ScrutinResult } from "./model/scrutins";
|
|
6
6
|
import { CirconscriptionResult, OrganismeResult, SenateurResult } from "./model/sens";
|
|
7
|
-
import {
|
|
7
|
+
import { Reunion } from "./types/agenda";
|
|
8
8
|
import { FlatTexte } from "./types/texte";
|
|
9
9
|
import { CompteRendu } from "./types/compte_rendu";
|
|
10
10
|
export { EnabledDatasets } from "./datasets";
|
|
@@ -87,7 +87,7 @@ export declare function loadSenatTexteContent(dataDir: string, textePathFromData
|
|
|
87
87
|
export declare function loadSenatCompteRenduContent(dataDir: string, session: number, debatId: string | number): {
|
|
88
88
|
item: CompteRendu | null;
|
|
89
89
|
};
|
|
90
|
-
export declare function iterLoadSenatAgendas(dataDir: string, session: number | undefined): Generator<IterItem<
|
|
90
|
+
export declare function iterLoadSenatAgendas(dataDir: string, session: number | undefined): Generator<IterItem<Reunion>>;
|
|
91
91
|
export declare function iterLoadSenatCirconscriptions(dataDir: string, options?: {}): Generator<IterItem<CirconscriptionResult>>;
|
|
92
92
|
export declare function iterLoadSenatOrganismes(dataDir: string, options?: {}): Generator<IterItem<OrganismeResult>>;
|
|
93
93
|
export declare function iterLoadSenatSenateurs(dataDir: string, options?: {}): Generator<IterItem<SenateurResult>>;
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import * as cheerio from "cheerio";
|
|
2
2
|
import { CompteRendu } from "../types/compte_rendu";
|
|
3
|
-
import {
|
|
3
|
+
import { Reunion } from "../types/agenda";
|
|
4
4
|
export declare function getRemainingTextAfterSpeakerHeader($: cheerio.CheerioAPI, $p: cheerio.Cheerio<any>): string;
|
|
5
5
|
export type DaySection = {
|
|
6
6
|
title: string;
|
|
@@ -14,5 +14,5 @@ export declare function parseCommissionCRSectionFromDom($: cheerio.CheerioAPI, h
|
|
|
14
14
|
hourShort: string | null;
|
|
15
15
|
organe?: string | null;
|
|
16
16
|
section: DaySection;
|
|
17
|
-
matched?:
|
|
17
|
+
matched?: Reunion;
|
|
18
18
|
}): CompteRendu | null;
|
package/lib/model/commission.js
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import * as cheerio from "cheerio";
|
|
2
2
|
import path from "path";
|
|
3
|
-
import {
|
|
4
|
-
import {
|
|
5
|
-
import { frDateToISO, hourShortToStartTime } from "../utils/
|
|
3
|
+
import { makeReunionUid } from "../utils/reunion_parsing";
|
|
4
|
+
import { norm } from "../utils/string_cleaning";
|
|
5
|
+
import { frDateToISO, hourShortToStartTime } from "../utils/date";
|
|
6
|
+
import { toCRDate } from "./util";
|
|
6
7
|
const PARA_h3_SEL = "p.sh_justify, p.sh_center, p.sh_marge, p[align], li, h3";
|
|
7
8
|
function findDayRoot($, targetISO) {
|
|
8
9
|
let $root = $();
|
|
@@ -208,7 +209,7 @@ export function extractDayH3Sections($, dateISO) {
|
|
|
208
209
|
export function parseCommissionCRSectionFromDom($, htmlFilePath, opts) {
|
|
209
210
|
try {
|
|
210
211
|
const { dateISO, hourShort, organe, section, matched } = opts;
|
|
211
|
-
const seanceRef = matched?.uid ??
|
|
212
|
+
const seanceRef = matched?.uid ?? makeReunionUid(dateISO, "COM", matched?.events[0].id ?? hourShort ?? "", organe ?? undefined);
|
|
212
213
|
const uid = seanceRef.replace(/^RU/, "CRC");
|
|
213
214
|
const dateSeance = toCRDate(dateISO, matched?.startTime ?? hourShortToStartTime(hourShort));
|
|
214
215
|
const $dayRoot = findDayRoot($, dateISO);
|
package/lib/model/seance.d.ts
CHANGED
|
@@ -1,9 +1,3 @@
|
|
|
1
|
-
import { CompteRendu
|
|
2
|
-
|
|
3
|
-
export declare function parseCompteRenduSlotFromFile(xmlFilePath: string, wantedSlot: TimeSlot, firstSlotOfDay?: TimeSlot): Promise<CompteRendu | null>;
|
|
1
|
+
import { CompteRendu } from "../types/compte_rendu";
|
|
2
|
+
export declare function parseCompteRenduIntervalFromFile(xmlFilePath: string, startIndex: number, endIndex: number, agendaEventId: string): Promise<CompteRendu | null>;
|
|
4
3
|
export declare function sessionStartYearFromDate(d: Date): number;
|
|
5
|
-
export declare function parseYYYYMMDD(yyyymmdd: string): Date | null;
|
|
6
|
-
export declare function deriveTitreObjetFromSommaire(sommaire: Sommaire | undefined, slot: TimeSlot): {
|
|
7
|
-
titre: string;
|
|
8
|
-
objet: string;
|
|
9
|
-
};
|
package/lib/model/seance.js
CHANGED
|
@@ -1,33 +1,29 @@
|
|
|
1
1
|
import fs from "fs";
|
|
2
2
|
import * as cheerio from "cheerio";
|
|
3
|
-
import
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
export async function parseCompteRenduSlotFromFile(xmlFilePath, wantedSlot, firstSlotOfDay) {
|
|
3
|
+
import { toCRDate } from "./util";
|
|
4
|
+
import { makeReunionUid } from "../utils/reunion_parsing";
|
|
5
|
+
import { yyyymmddFromPath } from "../utils/date";
|
|
6
|
+
import { decodeHtmlEntities, dedupeSpeaker, fixApostrophes, norm } from "../utils/string_cleaning";
|
|
7
|
+
export async function parseCompteRenduIntervalFromFile(xmlFilePath, startIndex, endIndex, agendaEventId) {
|
|
9
8
|
try {
|
|
10
9
|
const raw = fs.readFileSync(xmlFilePath, "utf8");
|
|
11
10
|
const $ = cheerio.load(raw, { xml: false });
|
|
12
11
|
const metadonnees = extractMetadonnees($, xmlFilePath);
|
|
13
12
|
const order = $("body *").toArray();
|
|
14
13
|
const idx = new Map(order.map((el, i) => [el, i]));
|
|
15
|
-
const
|
|
16
|
-
const
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
14
|
+
const totalNodes = order.length;
|
|
15
|
+
const clampedStart = Math.max(0, Math.min(startIndex, totalNodes - 1));
|
|
16
|
+
const clampedEnd = Math.max(0, Math.min(endIndex, totalNodes - 1));
|
|
17
|
+
const intervals = [
|
|
18
|
+
{
|
|
19
|
+
start: clampedStart,
|
|
20
|
+
end: clampedEnd,
|
|
21
|
+
},
|
|
22
|
+
];
|
|
21
23
|
metadonnees.sommaire = extractSommaireForIntervals($, idx, intervals);
|
|
22
24
|
const points = [];
|
|
23
25
|
let ordre = 0;
|
|
24
26
|
const addPoint = (p) => points.push({ ...p, ordre_absolu_seance: String(++ordre) });
|
|
25
|
-
// Titles removes because they are just listed at the top of the file and not linked to any ancre
|
|
26
|
-
// $("cri\\:titreS1 p.titre_S1").each((_, el) => {
|
|
27
|
-
// if (!elementInAnyInterval(el, idx, intervals)) return
|
|
28
|
-
// const t = normalizeTitle(norm($(el).text() || ""))
|
|
29
|
-
// if (t) addPoint({ code_grammaire: "TITRE_TEXTE_DISCUSSION", texte: { _: t }, code_style: "Titre" })
|
|
30
|
-
// })
|
|
31
27
|
// Interventions
|
|
32
28
|
$("div.intervenant").each((_, block) => {
|
|
33
29
|
if (!elementInAnyInterval(block, idx, intervals))
|
|
@@ -47,6 +43,8 @@ export async function parseCompteRenduSlotFromFile(xmlFilePath, wantedSlot, firs
|
|
|
47
43
|
].join(","))
|
|
48
44
|
.remove();
|
|
49
45
|
const firstP = $block.find("p").first();
|
|
46
|
+
if (!firstP || firstP.length === 0)
|
|
47
|
+
return;
|
|
50
48
|
const speakerLabelRaw = firstP.find(".orateur_nom").text() || firstP.find("a.lien_senfic").text() || "";
|
|
51
49
|
const speakerLabel = dedupeSpeaker(speakerLabelRaw);
|
|
52
50
|
const { mat, nom: nomCRI, qua: quaCRI } = readIntervenantMeta($block);
|
|
@@ -55,7 +53,8 @@ export async function parseCompteRenduSlotFromFile(xmlFilePath, wantedSlot, firs
|
|
|
55
53
|
const canonicalName = dedupeSpeaker(nomCRI || speakerLabel);
|
|
56
54
|
const role = roleForSpeaker(speakerLabel) || roleForSpeaker(qualite) || roleForSpeaker(quaCRI || "");
|
|
57
55
|
const speechHtml = sanitizeInterventionHtml($, $block);
|
|
58
|
-
|
|
56
|
+
const speechText = norm(cheerio.load(speechHtml).text() || "");
|
|
57
|
+
if (!speechText)
|
|
59
58
|
return;
|
|
60
59
|
addPoint({
|
|
61
60
|
code_grammaire: "PAROLE_GENERIQUE",
|
|
@@ -65,19 +64,25 @@ export async function parseCompteRenduSlotFromFile(xmlFilePath, wantedSlot, firs
|
|
|
65
64
|
});
|
|
66
65
|
});
|
|
67
66
|
const contenu = {
|
|
68
|
-
quantiemes: {
|
|
67
|
+
quantiemes: {
|
|
68
|
+
journee: metadonnees.dateSeance,
|
|
69
|
+
session: metadonnees.session,
|
|
70
|
+
},
|
|
69
71
|
point: points,
|
|
70
72
|
};
|
|
73
|
+
const yyyymmdd = yyyymmddFromPath(xmlFilePath);
|
|
74
|
+
const dateISO = `${yyyymmdd.slice(0, 4)}-${yyyymmdd.slice(4, 6)}-${yyyymmdd.slice(6, 8)}`;
|
|
75
|
+
const seanceRef = makeReunionUid(dateISO, "SP", agendaEventId, null);
|
|
71
76
|
return {
|
|
72
|
-
uid:
|
|
73
|
-
seanceRef
|
|
77
|
+
uid: `CRSSN${yyyymmdd}E${agendaEventId}`,
|
|
78
|
+
seanceRef,
|
|
74
79
|
sessionRef: metadonnees.session,
|
|
75
80
|
metadonnees,
|
|
76
81
|
contenu,
|
|
77
82
|
};
|
|
78
83
|
}
|
|
79
84
|
catch (e) {
|
|
80
|
-
console.error(`[CRI]
|
|
85
|
+
console.error(`[CRI] parseInterval error file=${xmlFilePath} interval=[${startIndex}..${endIndex}] event=${agendaEventId}:`, e);
|
|
81
86
|
return null;
|
|
82
87
|
}
|
|
83
88
|
}
|
|
@@ -87,96 +92,6 @@ export function sessionStartYearFromDate(d) {
|
|
|
87
92
|
const y = d.getFullYear();
|
|
88
93
|
return m >= 9 ? y : y - 1;
|
|
89
94
|
}
|
|
90
|
-
export function parseYYYYMMDD(yyyymmdd) {
|
|
91
|
-
if (!/^\d{8}$/.test(yyyymmdd))
|
|
92
|
-
return null;
|
|
93
|
-
const y = Number(yyyymmdd.slice(0, 4));
|
|
94
|
-
const m = Number(yyyymmdd.slice(4, 6)) - 1;
|
|
95
|
-
const d = Number(yyyymmdd.slice(6, 8));
|
|
96
|
-
const dt = new Date(y, m, d);
|
|
97
|
-
return Number.isFinite(dt.getTime()) ? dt : null;
|
|
98
|
-
}
|
|
99
|
-
export function deriveTitreObjetFromSommaire(sommaire, slot) {
|
|
100
|
-
const items = extractLevel1Items(sommaire);
|
|
101
|
-
const meaningful = items.filter((it) => !isBoilerplate(it.label));
|
|
102
|
-
if (meaningful.length === 0) {
|
|
103
|
-
return {
|
|
104
|
-
titre: `Séance publique ${slotLabel(slot)}`,
|
|
105
|
-
objet: "",
|
|
106
|
-
};
|
|
107
|
-
}
|
|
108
|
-
const titre = meaningful[0].label;
|
|
109
|
-
const objet = meaningful
|
|
110
|
-
.slice(0, 3)
|
|
111
|
-
.map((it) => it.label)
|
|
112
|
-
.join(" ; ");
|
|
113
|
-
return { titre, objet };
|
|
114
|
-
}
|
|
115
|
-
function slotLabel(slot) {
|
|
116
|
-
switch (slot) {
|
|
117
|
-
case "MATIN":
|
|
118
|
-
return "du matin";
|
|
119
|
-
case "APRES-MIDI":
|
|
120
|
-
return "de l’après-midi";
|
|
121
|
-
case "SOIR":
|
|
122
|
-
return "du soir";
|
|
123
|
-
default:
|
|
124
|
-
return "";
|
|
125
|
-
}
|
|
126
|
-
}
|
|
127
|
-
const BOILERPLATE_PATTERNS = [
|
|
128
|
-
/proc(?:è|e)s-?verbal/i,
|
|
129
|
-
/hommages?/i,
|
|
130
|
-
/désignation des vice-?président/i,
|
|
131
|
-
/candidatures? aux?/i,
|
|
132
|
-
/ordre du jour/i,
|
|
133
|
-
/rappels? au règlement/i,
|
|
134
|
-
/communications?/i,
|
|
135
|
-
/dépôts?/i,
|
|
136
|
-
/proclamation/i,
|
|
137
|
-
/présidence de/i,
|
|
138
|
-
/questions? diverses?/i,
|
|
139
|
-
/ouverture de la séance/i,
|
|
140
|
-
/clo(?:t|̂)ure de la séance/i,
|
|
141
|
-
];
|
|
142
|
-
const isBoilerplate = (label) => !label?.trim() || BOILERPLATE_PATTERNS.some((rx) => rx.test(label));
|
|
143
|
-
function extractLevel1Items(sommaire) {
|
|
144
|
-
const level1 = asArray(sommaire?.sommaire1);
|
|
145
|
-
return level1
|
|
146
|
-
.map((el) => ({
|
|
147
|
-
numero: toInt(el?.valeur_pts_odj),
|
|
148
|
-
label: String(el?.titreStruct?.intitule ?? "").trim(),
|
|
149
|
-
}))
|
|
150
|
-
.filter((it) => !!it.label)
|
|
151
|
-
.sort((a, b) => a.numero - b.numero);
|
|
152
|
-
}
|
|
153
|
-
function stripTrailingPunct(s) {
|
|
154
|
-
return s.replace(/\s*([:,.;])\s*$/u, "").trim();
|
|
155
|
-
}
|
|
156
|
-
function dedupeSpeaker(raw) {
|
|
157
|
-
let s = norm(raw);
|
|
158
|
-
s = stripTrailingPunct(s);
|
|
159
|
-
const dupPatterns = [/^(.+?)\s*[.]\s*\1$/u, /^(.+?)\s*,\s*\1,?$/u, /^(.+?)\s+\1$/u];
|
|
160
|
-
for (const re of dupPatterns) {
|
|
161
|
-
const m = s.match(re);
|
|
162
|
-
if (m) {
|
|
163
|
-
s = m[1];
|
|
164
|
-
break;
|
|
165
|
-
}
|
|
166
|
-
}
|
|
167
|
-
return s.replace(/\.\s*$/, "");
|
|
168
|
-
}
|
|
169
|
-
function fixApostrophes(s) {
|
|
170
|
-
let out = s;
|
|
171
|
-
out = out.replace(/\s*’\s*/g, "’");
|
|
172
|
-
out = out.replace(/\b([dljctmsn])\s*’/gi, (_, m) => m + "’");
|
|
173
|
-
out = out.replace(/’\s+([A-Za-zÀ-ÖØ-öø-ÿ])/g, "’$1");
|
|
174
|
-
out = out.replace(/\s+([,;:.!?])/g, "$1");
|
|
175
|
-
return out;
|
|
176
|
-
}
|
|
177
|
-
function normalizeTitle(text) {
|
|
178
|
-
return text.replace(/^PR[ÉE]SIDENCE DE\b/i, "Présidence de ");
|
|
179
|
-
}
|
|
180
95
|
function roleForSpeaker(labelOrQualite) {
|
|
181
96
|
const s = (labelOrQualite || "").toLowerCase();
|
|
182
97
|
if (/^(m\.|mme)?\s*(le|la)\s+pr[ée]sident(e)?\b/.test(s) || /\bpr[ée]sident[e]?\s+de\s+séance\b/.test(s))
|
package/lib/model/util.d.ts
CHANGED
|
@@ -6,8 +6,4 @@ export declare function removeSubstring(expr: Expression<string | null | undefin
|
|
|
6
6
|
export declare function replace(expr: Expression<string | null | undefined>, pattern: Expression<string>, replacement: Expression<string>): import("kysely").RawBuilder<string>;
|
|
7
7
|
export declare function rtrim(expr: Expression<string | null | undefined>): import("kysely").RawBuilder<string>;
|
|
8
8
|
export declare function toDateString(expr: Expression<Date | null | undefined>, format?: Expression<string>): import("kysely").RawBuilder<string>;
|
|
9
|
-
export declare function norm(s?: string | null): string;
|
|
10
9
|
export declare function toCRDate(dateISO: string, startTime?: string | null): string;
|
|
11
|
-
export declare function normalizeTitle(t: string): string;
|
|
12
|
-
export declare function jaccardTokenSim(a: string, b: string): number;
|
|
13
|
-
export declare function decodeHtmlEntities(s?: string | null): string;
|
package/lib/model/util.js
CHANGED
|
@@ -21,12 +21,6 @@ export function rtrim(expr) {
|
|
|
21
21
|
export function toDateString(expr, format = sql.val(STANDARD_DATE_FORMAT)) {
|
|
22
22
|
return sql `to_char(${expr}, ${format})`;
|
|
23
23
|
}
|
|
24
|
-
export function norm(s) {
|
|
25
|
-
return (s || "")
|
|
26
|
-
.replace(/\u00A0/g, " ")
|
|
27
|
-
.replace(/\s+/g, " ")
|
|
28
|
-
.trim();
|
|
29
|
-
}
|
|
30
24
|
export function toCRDate(dateISO, startTime) {
|
|
31
25
|
const yyyymmdd = dateISO.replace(/-/g, ""); // "20250716"
|
|
32
26
|
let hh = "00", mm = "00", ss = "00", SSS = "000";
|
|
@@ -42,35 +36,3 @@ export function toCRDate(dateISO, startTime) {
|
|
|
42
36
|
}
|
|
43
37
|
return `${yyyymmdd}${hh}${mm}${ss}${SSS}`;
|
|
44
38
|
}
|
|
45
|
-
export function normalizeTitle(t) {
|
|
46
|
-
return (t || "")
|
|
47
|
-
.toLowerCase()
|
|
48
|
-
.normalize("NFD")
|
|
49
|
-
.replace(/\p{Diacritic}/gu, "")
|
|
50
|
-
.replace(/[^a-z0-9\s]/g, " ")
|
|
51
|
-
.replace(/\s+/g, " ")
|
|
52
|
-
.trim();
|
|
53
|
-
}
|
|
54
|
-
export function jaccardTokenSim(a, b) {
|
|
55
|
-
const A = new Set(normalizeTitle(a).split(" ").filter(Boolean));
|
|
56
|
-
const B = new Set(normalizeTitle(b).split(" ").filter(Boolean));
|
|
57
|
-
if (A.size === 0 || B.size === 0)
|
|
58
|
-
return 0;
|
|
59
|
-
let inter = 0;
|
|
60
|
-
for (const x of A)
|
|
61
|
-
if (B.has(x))
|
|
62
|
-
inter++;
|
|
63
|
-
return inter / (A.size + B.size - inter);
|
|
64
|
-
}
|
|
65
|
-
export function decodeHtmlEntities(s) {
|
|
66
|
-
if (!s)
|
|
67
|
-
return "";
|
|
68
|
-
return s
|
|
69
|
-
.replace(/&#x([0-9a-fA-F]+);/g, (_, h) => String.fromCodePoint(parseInt(h, 16)))
|
|
70
|
-
.replace(/&#(\d+);/g, (_, d) => String.fromCodePoint(parseInt(d, 10)))
|
|
71
|
-
.replace(/&amp;/g, "&")
|
|
72
|
-
.replace(/&lt;/g, "<")
|
|
73
|
-
.replace(/&gt;/g, ">")
|
|
74
|
-
.replace(/&quot;/g, '"')
|
|
75
|
-
.replace(/&apos;/g, "'");
|
|
76
|
-
}
|
|
@@ -9,7 +9,7 @@ import { getSessionsFromStart } from "../types/sessions";
|
|
|
9
9
|
import { ID_DATE_FORMAT } from "./datautil";
|
|
10
10
|
import { commonOptions } from "./shared/cli_helpers";
|
|
11
11
|
import { fetchWithRetry } from "./shared/util";
|
|
12
|
-
import {
|
|
12
|
+
import { buildReunionsByBucket } from "../utils/reunion_parsing";
|
|
13
13
|
import { buildSenatDossierIndex } from "../utils/reunion_odj_building";
|
|
14
14
|
const optionsDefinitions = [
|
|
15
15
|
...commonOptions,
|
|
@@ -103,25 +103,14 @@ async function parseAgenda(transformedAgendaSessionDir, agendaFileName, agendaPa
|
|
|
103
103
|
return;
|
|
104
104
|
const flatPath = path.join(transformedAgendaSessionDir, `${agendaFileName}.json`);
|
|
105
105
|
fs.writeJSONSync(flatPath, parsedAgendaEvents, { spaces: 2 });
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
// b) (reco) trier pour stabilité, comme pour les NON-SP
|
|
111
|
-
const PARIS = "Europe/Paris";
|
|
112
|
-
spGroups.sort((a, b) => {
|
|
113
|
-
const da = DateTime.fromISO(`${a.date}T${a.startTime || "00:00:00.000+02:00"}`, { zone: PARIS }).toMillis();
|
|
114
|
-
const db = DateTime.fromISO(`${b.date}T${b.startTime || "00:00:00.000+02:00"}`, { zone: PARIS }).toMillis();
|
|
115
|
-
// en cas d’égalité, ordre par slot pour stabilité
|
|
116
|
-
return da - db || (a.slot || "UNKNOWN").localeCompare(b.slot || "UNKNOWN");
|
|
117
|
-
});
|
|
118
|
-
if (spGroups.length > 0) {
|
|
119
|
-
writeGroupsAsFiles(transformedAgendaSessionDir, spGroups);
|
|
106
|
+
const byBucket = buildReunionsByBucket(parsedAgendaEvents, dossierBySenatUrl);
|
|
107
|
+
// SP
|
|
108
|
+
if (byBucket.IDS.length > 0) {
|
|
109
|
+
writeGroupsAsFiles(transformedAgendaSessionDir, byBucket.IDS);
|
|
120
110
|
}
|
|
121
|
-
//
|
|
122
|
-
const groupedBySuffix = groupNonSPByTypeOrganeHour(parsedAgendaEvents, dossierBySenatUrl);
|
|
111
|
+
// NON-SP
|
|
123
112
|
for (const suffix of ["IDC", "IDM", "IDO", "IDI"]) {
|
|
124
|
-
const groups =
|
|
113
|
+
const groups = byBucket[suffix];
|
|
125
114
|
if (groups.length > 0) {
|
|
126
115
|
writeGroupsAsFiles(transformedAgendaSessionDir, groups);
|
|
127
116
|
}
|
|
@@ -10,7 +10,7 @@ import { commonOptions } from "./shared/cli_helpers";
|
|
|
10
10
|
import { sessionStartYearFromDate } from "../model/seance";
|
|
11
11
|
import { getSessionsFromStart } from "../types/sessions";
|
|
12
12
|
import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
|
|
13
|
-
import { jaccardTokenSim } from "../
|
|
13
|
+
import { jaccard, jaccardTokenSim } from "../utils/scoring";
|
|
14
14
|
class CommissionCRDownloadError extends Error {
|
|
15
15
|
constructor(message, url) {
|
|
16
16
|
super(`An error occurred while retrieving Commission CR ${url}: ${message}`);
|
|
@@ -138,15 +138,6 @@ function toTokens(s) {
|
|
|
138
138
|
.split(/\s+/)
|
|
139
139
|
.filter((t) => t.length >= 3 && !["commission", "des", "de", "du", "d", "la", "le", "les", "et"].includes(t)));
|
|
140
140
|
}
|
|
141
|
-
function jaccard(a, b) {
|
|
142
|
-
if (!a.size || !b.size)
|
|
143
|
-
return 0;
|
|
144
|
-
let inter = 0;
|
|
145
|
-
for (const t of a)
|
|
146
|
-
if (b.has(t))
|
|
147
|
-
inter++;
|
|
148
|
-
return inter / (a.size + b.size - inter);
|
|
149
|
-
}
|
|
150
141
|
function reunionOrganeCandidates(h) {
|
|
151
142
|
const any = h;
|
|
152
143
|
const out = [any.organeSlug, any.organeKey, any.organe, h.titre].filter(Boolean);
|