@tricoteuses/senat 2.16.2 → 2.16.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/model/commission.d.ts +1 -0
- package/lib/model/commission.js +34 -19
- package/lib/model/seance.js +25 -10
- package/lib/scripts/convert_data.js +5 -3
- package/lib/scripts/retrieve_cr_commission.js +4 -4
- package/lib/scripts/retrieve_videos.js +3 -4
- package/lib/scripts/shared/util.d.ts +1 -0
- package/lib/scripts/shared/util.js +7 -0
- package/package.json +1 -1
|
@@ -5,6 +5,7 @@ export declare function getRemainingTextAfterSpeakerHeader($: cheerio.CheerioAPI
|
|
|
5
5
|
/**
 * One `<h3>`-delimited section found inside a day of a commission report.
 */
export type DaySection = {
    /** Heading text with whitespace collapsed and trimmed. */
    title: string;
    /** Cheerio wrapper around the `<h3>` element that opens the section. */
    $start: cheerio.Cheerio<any>;
    /** Start time as "HH:mm" when one was detected just before the heading; otherwise absent. */
    time?: string;
};
|
|
9
10
|
/**
 * Normalizes a heading: every run of whitespace becomes a single space and
 * the result is trimmed. A falsy input yields an empty string.
 */
export declare function cleanTitle(t: string): string;
|
|
10
11
|
/**
 * Lists the `<h3>` sections that follow the root element of the day
 * identified by `dateISO`, stopping at the next `<h2>`. Headings whose
 * cleaned title is empty are skipped. Returns an empty array when no root
 * element is found for that day.
 */
export declare function extractDayH3Sections($: cheerio.CheerioAPI, dateISO: string): DaySection[];
|
package/lib/model/commission.js
CHANGED
|
@@ -161,33 +161,48 @@ function buildPointsFromParagraphs($, paras) {
|
|
|
161
161
|
flush();
|
|
162
162
|
return points;
|
|
163
163
|
}
|
|
164
|
+
/**
 * Matches a French clock time such as "14 h 30", "9h05", "à 16 heures" or
 * "14 heures 30". Group 1 is the hour (1-2 digits); group 2 is the optional
 * two-digit minute part.
 *
 * - `heures?` is listed before `h`: with `h` first the engine always
 *   succeeded on the bare letter (everything after it is optional), so
 *   "14 heures 30" lost its minutes.
 * - `(?!\p{L})` rejects an `h` that merely starts another word ("12 hommes").
 * - `(?<!\d)` prevents matching the tail of a longer number ("114 h").
 */
const TIME_RE = /(?:\b[àa]\s*)?(?<!\d)(\d{1,2})\s*(?:heures?|h)(?!\p{L})\s*(?:([0-5]\d))?/iu;
|
|
164
165
|
/**
 * Normalizes a heading for comparison/display.
 *
 * @param t Raw heading text; any falsy value is treated as "".
 * @returns The text with each whitespace run reduced to one space, trimmed.
 */
export function cleanTitle(t) {
    const raw = t || "";
    // Splitting on whitespace runs and re-joining with a single space is
    // equivalent to a global replace of /\s+/ by " ".
    const collapsed = raw.split(/\s+/).join(" ");
    return collapsed.trim();
}
|
|
168
|
+
/**
 * Extracts the first clock time recognised by TIME_RE in `text`.
 *
 * @param text Free text that may contain a time.
 * @returns The time as zero-padded "HH:mm" (minutes default to "00"), or
 *          `undefined` when nothing matches or the hour is outside 0-23.
 */
function parseTimeToHHmm(text) {
    const found = normalizeSpaces(text).match(TIME_RE);
    if (!found) {
        return undefined;
    }
    const [, rawHours, rawMinutes] = found;
    const hours = rawHours?.padStart(2, "0");
    const minutes = (rawMinutes ?? "00").padStart(2, "0");
    const hourValue = Number(hours);
    const isValidHour = hourValue >= 0 && hourValue <= 23;
    return isValidHour ? `${hours}:${minutes}` : undefined;
}
|
|
179
|
+
/**
 * Looks for a clock time in the elements immediately preceding a heading.
 *
 * At most three previous siblings are inspected, nearest first. For each
 * one, its whole text is tried first, then the text of its first `<i>`/`<em>`
 * descendant.
 *
 * @param $ Cheerio API instance (not used by the current implementation).
 * @param $h3 Cheerio wrapper around the heading.
 * @returns The first "HH:mm" found, or `undefined`.
 */
function findNearbyTime($, $h3) {
    const MAX_LOOKBACK = 3;
    let $sibling = $h3.prev();
    let inspected = 0;
    while (inspected < MAX_LOOKBACK && $sibling.length) {
        // `||` keeps the italic lookup lazy: it only runs when the full text fails.
        const time = parseTimeToHHmm($sibling.text()) ||
            parseTimeToHHmm($sibling.find("i, em").first().text());
        if (time) {
            return time;
        }
        inspected += 1;
        $sibling = $sibling.prev();
    }
    return undefined;
}
|
|
167
191
|
/**
 * Collects the `<h3>` sections belonging to one day of a report.
 *
 * The day's root element is located with `findDayRoot`; its following
 * siblings up to (not including) the next `<h2>` form the search range.
 * Both `<h3>` siblings and `<h3>` elements nested inside those siblings are
 * considered.
 *
 * @param $ Cheerio API instance for the document.
 * @param dateISO Day identifier handed to `findDayRoot` (presumably an ISO date — confirm against caller).
 * @returns One entry per heading with a non-empty cleaned title, carrying the
 *          heading element and, when detected, a nearby "HH:mm" time.
 */
export function extractDayH3Sections($, dateISO) {
    const $dayRoot = findDayRoot($, dateISO);
    const sections = [];
    if ($dayRoot.length === 0) {
        return sections;
    }
    const $untilNextDay = $dayRoot.nextUntil("h2");
    const headings = $untilNextDay.filter("h3").add($untilNextDay.find("h3")).toArray();
    for (const heading of headings) {
        const $h3 = $(heading);
        const title = cleanTitle($h3.text());
        if (title) {
            sections.push({ title, $start: $h3, time: findNearbyTime($, $h3) });
        }
    }
    return sections;
}
|
|
193
208
|
export function parseCommissionCRSectionFromDom($, htmlFilePath, opts) {
|
package/lib/model/seance.js
CHANGED
|
@@ -237,16 +237,31 @@ function extractAndRemoveLeadingQualite($, $block) {
|
|
|
237
237
|
return fixApostrophes(norm(parts.join(" ")));
|
|
238
238
|
}
|
|
239
239
|
/**
 * Produces a reduced HTML rendering of an intervention block.
 *
 * Each `<p>` found in `$block` is cloned (the source DOM is left untouched),
 * then on the clone:
 *  - `.orateur_nom` and `.orateur_qualite` elements are removed;
 *  - every `<a>` is replaced by its text;
 *  - every `.info_entre_parentheses` element becomes an `<em>` with the same text;
 *  - `<span>` elements whose text is blank are removed.
 * Paragraphs left with empty inner HTML are dropped.
 *
 * @param $ Cheerio API instance for the document.
 * @param $block Cheerio wrapper around the intervention container.
 * @returns The surviving paragraphs, each wrapped in `<p>…</p>`, joined with "<br/>".
 */
function sanitizeInterventionHtml($, $block) {
    const paragraphs = [];
    for (const node of $block.find("p").toArray()) {
        const $copy = $(node).clone();
        $copy.find(".orateur_nom, .orateur_qualite").remove();
        $copy.find("a").each((_, link) => {
            const $link = $(link);
            // NOTE(review): replaceWith() given a string appears to parse it as
            // markup, so link text containing "<" may be re-interpreted — confirm.
            $link.replaceWith($link.text());
        });
        $copy.find(".info_entre_parentheses").each((_, aside) => {
            const $aside = $(aside);
            const emphasized = $("<em/>").text($aside.text());
            $aside.replaceWith(emphasized);
        });
        $copy.find("span").each((_, span) => {
            const $span = $(span);
            if ($span.text().trim() === "") {
                $span.remove();
            }
        });
        const inner = ($copy.html() || "").trim();
        if (inner) {
            paragraphs.push(`<p>${inner}</p>`);
        }
    }
    return paragraphs.join("<br/>");
}
|
|
251
266
|
function extractSommaireForIntervals($, idx, intervals) {
|
|
252
267
|
const inIv = (el) => elementInAnyInterval(el, idx, intervals);
|
|
@@ -9,7 +9,7 @@ import { createActesLegislatifs } from "../model/dosleg";
|
|
|
9
9
|
import { UNDEFINED_SESSION } from "../types/sessions";
|
|
10
10
|
import { getSessionFromDate, getSessionFromSignet } from "./datautil";
|
|
11
11
|
import { commonOptions } from "./shared/cli_helpers";
|
|
12
|
-
import { ensureAndClearDir } from "./shared/util";
|
|
12
|
+
import { ensureAndClearDir, ensureAndClearDirSync } from "./shared/util";
|
|
13
13
|
const optionsDefinitions = [...commonOptions];
|
|
14
14
|
const options = commandLineArgs(optionsDefinitions);
|
|
15
15
|
const SENAT_TEXTE_XML_BASE_URL = "https://www.senat.fr/akomantoso/";
|
|
@@ -47,7 +47,7 @@ async function convertDatasetAmeli(dataDir, options) {
|
|
|
47
47
|
console.log(`Converting database ${dataset.database} data into files…`);
|
|
48
48
|
}
|
|
49
49
|
const ameliReorganizedRootDir = path.join(dataDir, dataset.database);
|
|
50
|
-
|
|
50
|
+
ensureAndClearDirSync(ameliReorganizedRootDir);
|
|
51
51
|
for await (const amendement of findAllAmendements(options["fromSession"])) {
|
|
52
52
|
if (options["verbose"]) {
|
|
53
53
|
console.log(`Converting ${amendement["numero"]} file…`);
|
|
@@ -88,18 +88,20 @@ async function enrichDebat(debat, auteurs) {
|
|
|
88
88
|
const enrichedDebat = { ...debat };
|
|
89
89
|
for (const section of enrichedDebat.sections) {
|
|
90
90
|
for (const intervention of section.interventions) {
|
|
91
|
+
;
|
|
91
92
|
intervention.auteur = findAuteur(intervention["auteur_code"], auteurs);
|
|
92
93
|
}
|
|
93
94
|
}
|
|
94
95
|
for (const section of enrichedDebat.sections_divers) {
|
|
95
96
|
for (const intervention of section.interventions) {
|
|
97
|
+
;
|
|
96
98
|
intervention.auteur = findAuteur(intervention["auteur_code"], auteurs);
|
|
97
99
|
}
|
|
98
100
|
}
|
|
99
101
|
return enrichedDebat;
|
|
100
102
|
}
|
|
101
103
|
/**
 * Looks up an author by code.
 *
 * @param auteurCode Code to search for (compared with strict equality).
 * @param auteurs Array of author records exposing a `code` property.
 * @returns The first record whose `code` equals `auteurCode`, or `undefined`.
 */
function findAuteur(auteurCode, auteurs) {
    for (const candidate of auteurs) {
        if (candidate.code === auteurCode) {
            return candidate;
        }
    }
    return undefined;
}
|
|
104
106
|
async function convertDatasetDosLeg(dataDir, options) {
|
|
105
107
|
const dataset = datasets.dosleg;
|
|
@@ -281,9 +281,9 @@ async function retrieveCommissionCRs(options = {}) {
|
|
|
281
281
|
const MAX_TIME_DELTA_MIN = 120;
|
|
282
282
|
const ORGANE_GATE = 0.55;
|
|
283
283
|
const TITLE_GATE = 0.2;
|
|
284
|
-
const W_ORG = 0.
|
|
285
|
-
const W_TIM = 0.
|
|
286
|
-
const W_TIT = 0.
|
|
284
|
+
const W_ORG = 0.4;
|
|
285
|
+
const W_TIM = 0.4;
|
|
286
|
+
const W_TIT = 0.2;
|
|
287
287
|
for (let sIdx = 0; sIdx < sections.length; sIdx++) {
|
|
288
288
|
const sec = sections[sIdx];
|
|
289
289
|
let best = null;
|
|
@@ -292,7 +292,7 @@ async function retrieveCommissionCRs(options = {}) {
|
|
|
292
292
|
const scored = hits
|
|
293
293
|
.map((h) => {
|
|
294
294
|
const sOrg = organeSimilarity(h, commissionKey); // 0..1
|
|
295
|
-
const sTim = timeProximityScore(h, day.openTime ?? null, MAX_TIME_DELTA_MIN); // 0..1
|
|
295
|
+
const sTim = timeProximityScore(h, sec.time ?? day.openTime ?? null, MAX_TIME_DELTA_MIN); // 0..1
|
|
296
296
|
const sTit = titleSimilarity(h, sec.title); // 0..1
|
|
297
297
|
const total = W_ORG * sOrg + W_TIM * sTim + W_TIT * sTit;
|
|
298
298
|
return { h, sOrg, sTim, sTit, total };
|
|
@@ -9,14 +9,13 @@ import { getSessionsFromStart } from "../types/sessions";
|
|
|
9
9
|
import { commonOptions } from "./shared/cli_helpers";
|
|
10
10
|
import { decodeHtmlEntities } from "../model/util";
|
|
11
11
|
// ===================== Constants =====================
|
|
12
|
-
const MATCH_THRESHOLD = 0.
|
|
12
|
+
const MATCH_THRESHOLD = 0.5;
|
|
13
13
|
const MAX_CANDIDATES = 15;
|
|
14
14
|
const MAX_PAGES = 3;
|
|
15
15
|
const STATS = { total: 0, accepted: 0 };
|
|
16
16
|
const VIDEOS_ROOT_FOLDER = "videos";
|
|
17
17
|
const SENAT_VIDEOS_SEARCH_AJAX = "https://videos.senat.fr/senat_videos_search.php";
|
|
18
18
|
const SENAT_DATAS_ROOT = "https://videos.senat.fr/Datas/senat";
|
|
19
|
-
const SENAT_VOD_HOST = "https://vodsenat.akamaized.net";
|
|
20
19
|
// ===================== CLI =====================
|
|
21
20
|
const optionsDefinitions = [...commonOptions];
|
|
22
21
|
const options = commandLineArgs(optionsDefinitions);
|
|
@@ -219,9 +218,9 @@ function score(agenda, agendaTs, videoTitle, videoEpoch, videoOrgane) {
|
|
|
219
218
|
}
|
|
220
219
|
const orgScore = videoOrgane && agenda.organe ? dice(agenda.organe, videoOrgane) : 0;
|
|
221
220
|
if (orgScore === 0 && agenda.organe === "Séance publique") {
|
|
222
|
-
return 0.
|
|
221
|
+
return 0.3 * titleScore + 0.7 * timeScore;
|
|
223
222
|
}
|
|
224
|
-
return 0.
|
|
223
|
+
return 0.2 * titleScore + 0.4 * timeScore + orgScore * 0.4;
|
|
225
224
|
}
|
|
226
225
|
/**
|
|
227
226
|
* Build search strategies for senat's videos
|
|
@@ -1,3 +1,4 @@
|
|
|
1
1
|
/**
 * Returns true when `option` is falsy or empty, or when it includes `value`.
 */
export declare function isOptionEmptyOrHasValue(option: string, value: string): boolean;
|
|
2
|
+
/**
 * Synchronously makes sure `dir` exists, then deletes every entry inside it
 * (recursively), leaving the directory itself in place.
 */
export declare function ensureAndClearDirSync(dir: string): void;
|
|
2
3
|
export declare function ensureAndClearDir(path: string): void;
|
|
3
4
|
export declare function fetchWithRetry(url: string, retries?: number, backoff?: number): Promise<Response>;
|
|
@@ -1,7 +1,14 @@
|
|
|
1
1
|
import fs from "fs-extra";
|
|
2
|
+
import path from "path";
|
|
2
3
|
/**
 * Tells whether a filter option lets `value` through.
 *
 * @param option Option content; anything exposing `length` and `includes`.
 * @param value Value to look for.
 * @returns `true` when `option` is falsy or has length 0; otherwise the
 *          result of `option.includes(value)`.
 */
export function isOptionEmptyOrHasValue(option, value) {
    if (!option) {
        return true;
    }
    if (option.length === 0) {
        return true;
    }
    return option.includes(value);
}
|
|
6
|
+
/**
 * Synchronously guarantees that `dir` exists and is empty.
 *
 * The directory is created if needed, then each of its entries is removed
 * recursively; `force: true` makes removal tolerate entries that have
 * already disappeared. The directory itself is kept.
 *
 * @param dir Path of the directory to prepare.
 */
export function ensureAndClearDirSync(dir) {
    fs.ensureDirSync(dir);
    fs.readdirSync(dir).forEach((entry) => {
        fs.rmSync(path.join(dir, entry), { recursive: true, force: true });
    });
}
|
|
5
12
|
export function ensureAndClearDir(path) {
|
|
6
13
|
if (!fs.existsSync(path)) {
|
|
7
14
|
fs.mkdirSync(path, { recursive: true });
|