@tricoteuses/senat 2.16.2 → 2.16.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,6 +5,7 @@ export declare function getRemainingTextAfterSpeakerHeader($: cheerio.CheerioAPI
5
5
  export type DaySection = {
6
6
  title: string;
7
7
  $start: cheerio.Cheerio<any>;
8
+ time?: string;
8
9
  };
9
10
  export declare function cleanTitle(t: string): string;
10
11
  export declare function extractDayH3Sections($: cheerio.CheerioAPI, dateISO: string): DaySection[];
@@ -161,33 +161,48 @@ function buildPointsFromParagraphs($, paras) {
161
161
  flush();
162
162
  return points;
163
163
  }
164
+ const TIME_RE = /(?:\b[àa]\s*)?(\d{1,2})\s*(?:h|heures?)\s*(?:([0-5]\d))?/i;
164
165
  export function cleanTitle(t) {
165
166
  return (t || "").replace(/\s+/g, " ").trim();
166
167
  }
168
+ function parseTimeToHHmm(text) {
169
+ const m = normalizeSpaces(text).match(TIME_RE);
170
+ if (!m)
171
+ return undefined;
172
+ const hh = m[1]?.padStart(2, "0");
173
+ const mm = (m[2] ?? "00").padStart(2, "0");
174
+ const h = Number(hh);
175
+ if (h >= 0 && h <= 23)
176
+ return `${hh}:${mm}`;
177
+ return undefined;
178
+ }
179
+ function findNearbyTime($, $h3) {
180
+ let cur = $h3.prev();
181
+ for (let i = 0; i < 3 && cur.length; i++, cur = cur.prev()) {
182
+ const direct = parseTimeToHHmm(cur.text());
183
+ if (direct)
184
+ return direct;
185
+ const italic = parseTimeToHHmm(cur.find("i, em").first().text());
186
+ if (italic)
187
+ return italic;
188
+ }
189
+ return undefined;
190
+ }
167
191
  export function extractDayH3Sections($, dateISO) {
168
192
  const sections = [];
169
193
  const $dayRoot = findDayRoot($, dateISO);
170
194
  if ($dayRoot.length === 0)
171
195
  return sections;
172
- let $cursor = $dayRoot.next();
173
- while ($cursor.length && !$cursor.is("h2")) {
174
- if ($cursor.is("h3")) {
175
- const title = cleanTitle($cursor.text());
176
- if (title)
177
- sections.push({ title, $start: $cursor });
178
- $cursor = $cursor.next();
179
- continue;
180
- }
181
- const $h3 = $cursor.find("h3").first();
182
- if ($h3.length) {
183
- const title = cleanTitle($h3.text());
184
- if (title)
185
- sections.push({ title, $start: $h3 });
186
- $cursor = $cursor.next();
187
- continue;
188
- }
189
- $cursor = $cursor.next();
190
- }
196
+ const $range = $dayRoot.nextUntil("h2");
197
+ const $h3s = $range.filter("h3").add($range.find("h3"));
198
+ $h3s.each((_, el) => {
199
+ const $h3 = $(el);
200
+ const title = cleanTitle($h3.text());
201
+ if (!title)
202
+ return;
203
+ const time = findNearbyTime($, $h3);
204
+ sections.push({ title, $start: $h3, time });
205
+ });
191
206
  return sections;
192
207
  }
193
208
  export function parseCommissionCRSectionFromDom($, htmlFilePath, opts) {
@@ -237,16 +237,31 @@ function extractAndRemoveLeadingQualite($, $block) {
237
237
  return fixApostrophes(norm(parts.join(" ")));
238
238
  }
239
239
  function sanitizeInterventionHtml($, $block) {
240
- const $clone = $block.clone();
241
- $clone.find("a[name]").remove();
242
- $clone.find('div[align="right"]').remove();
243
- $clone.find("a.link").remove();
244
- $clone.find("img").remove();
245
- $clone.find("a#ameli_amendement_cri_phrase, a#ameli_amendement_cra_contenu, a#ameli_amendement_cra_objet").remove();
246
- $clone.find(".orateur_nom, .orateur_qualite").remove();
247
- let html = $clone.html() || "";
248
- html = html.replace(/<!--[\s\S]*?-->/g, "");
249
- return html.trim();
240
+ const ps = $block.find("p").toArray();
241
+ const cleaned = ps
242
+ .map((p) => {
243
+ const $p = $(p).clone();
244
+ $p.find(".orateur_nom, .orateur_qualite").remove();
245
+ $p.find("a").each((_, a) => {
246
+ const $a = $(a);
247
+ $a.replaceWith($a.text());
248
+ });
249
+ $p.find(".info_entre_parentheses").each((_, el) => {
250
+ const txt = $(el).text();
251
+ $(el).replaceWith($("<em/>").text(txt));
252
+ });
253
+ $p.find("span").each((_, span) => {
254
+ const $s = $(span);
255
+ if (!$s.text().trim())
256
+ $s.remove();
257
+ });
258
+ const inner = ($p.html() || "").trim();
259
+ if (!inner)
260
+ return null;
261
+ return `<p>${inner}</p>`;
262
+ })
263
+ .filter(Boolean);
264
+ return cleaned.join("<br/>");
250
265
  }
251
266
  function extractSommaireForIntervals($, idx, intervals) {
252
267
  const inIv = (el) => elementInAnyInterval(el, idx, intervals);
@@ -9,7 +9,7 @@ import { createActesLegislatifs } from "../model/dosleg";
9
9
  import { UNDEFINED_SESSION } from "../types/sessions";
10
10
  import { getSessionFromDate, getSessionFromSignet } from "./datautil";
11
11
  import { commonOptions } from "./shared/cli_helpers";
12
- import { ensureAndClearDir } from "./shared/util";
12
+ import { ensureAndClearDir, ensureAndClearDirSync } from "./shared/util";
13
13
  const optionsDefinitions = [...commonOptions];
14
14
  const options = commandLineArgs(optionsDefinitions);
15
15
  const SENAT_TEXTE_XML_BASE_URL = "https://www.senat.fr/akomantoso/";
@@ -47,7 +47,7 @@ async function convertDatasetAmeli(dataDir, options) {
47
47
  console.log(`Converting database ${dataset.database} data into files…`);
48
48
  }
49
49
  const ameliReorganizedRootDir = path.join(dataDir, dataset.database);
50
- ensureAndClearDir(ameliReorganizedRootDir);
50
+ ensureAndClearDirSync(ameliReorganizedRootDir);
51
51
  for await (const amendement of findAllAmendements(options["fromSession"])) {
52
52
  if (options["verbose"]) {
53
53
  console.log(`Converting ${amendement["numero"]} file…`);
@@ -88,18 +88,20 @@ async function enrichDebat(debat, auteurs) {
88
88
  const enrichedDebat = { ...debat };
89
89
  for (const section of enrichedDebat.sections) {
90
90
  for (const intervention of section.interventions) {
91
+ ;
91
92
  intervention.auteur = findAuteur(intervention["auteur_code"], auteurs);
92
93
  }
93
94
  }
94
95
  for (const section of enrichedDebat.sections_divers) {
95
96
  for (const intervention of section.interventions) {
97
+ ;
96
98
  intervention.auteur = findAuteur(intervention["auteur_code"], auteurs);
97
99
  }
98
100
  }
99
101
  return enrichedDebat;
100
102
  }
101
103
  function findAuteur(auteurCode, auteurs) {
102
- return auteurs.find(auteur => auteur.code === auteurCode);
104
+ return auteurs.find((auteur) => auteur.code === auteurCode);
103
105
  }
104
106
  async function convertDatasetDosLeg(dataDir, options) {
105
107
  const dataset = datasets.dosleg;
@@ -281,9 +281,9 @@ async function retrieveCommissionCRs(options = {}) {
281
281
  const MAX_TIME_DELTA_MIN = 120;
282
282
  const ORGANE_GATE = 0.55;
283
283
  const TITLE_GATE = 0.2;
284
- const W_ORG = 0.5;
285
- const W_TIM = 0.2;
286
- const W_TIT = 0.3;
284
+ const W_ORG = 0.4;
285
+ const W_TIM = 0.4;
286
+ const W_TIT = 0.2;
287
287
  for (let sIdx = 0; sIdx < sections.length; sIdx++) {
288
288
  const sec = sections[sIdx];
289
289
  let best = null;
@@ -292,7 +292,7 @@ async function retrieveCommissionCRs(options = {}) {
292
292
  const scored = hits
293
293
  .map((h) => {
294
294
  const sOrg = organeSimilarity(h, commissionKey); // 0..1
295
- const sTim = timeProximityScore(h, day.openTime ?? null, MAX_TIME_DELTA_MIN); // 0..1
295
+ const sTim = timeProximityScore(h, sec.time ?? day.openTime ?? null, MAX_TIME_DELTA_MIN); // 0..1
296
296
  const sTit = titleSimilarity(h, sec.title); // 0..1
297
297
  const total = W_ORG * sOrg + W_TIM * sTim + W_TIT * sTit;
298
298
  return { h, sOrg, sTim, sTit, total };
@@ -9,14 +9,13 @@ import { getSessionsFromStart } from "../types/sessions";
9
9
  import { commonOptions } from "./shared/cli_helpers";
10
10
  import { decodeHtmlEntities } from "../model/util";
11
11
  // ===================== Constants =====================
12
- const MATCH_THRESHOLD = 0.56;
12
+ const MATCH_THRESHOLD = 0.5;
13
13
  const MAX_CANDIDATES = 15;
14
14
  const MAX_PAGES = 3;
15
15
  const STATS = { total: 0, accepted: 0 };
16
16
  const VIDEOS_ROOT_FOLDER = "videos";
17
17
  const SENAT_VIDEOS_SEARCH_AJAX = "https://videos.senat.fr/senat_videos_search.php";
18
18
  const SENAT_DATAS_ROOT = "https://videos.senat.fr/Datas/senat";
19
- const SENAT_VOD_HOST = "https://vodsenat.akamaized.net";
20
19
  // ===================== CLI =====================
21
20
  const optionsDefinitions = [...commonOptions];
22
21
  const options = commandLineArgs(optionsDefinitions);
@@ -219,9 +218,9 @@ function score(agenda, agendaTs, videoTitle, videoEpoch, videoOrgane) {
219
218
  }
220
219
  const orgScore = videoOrgane && agenda.organe ? dice(agenda.organe, videoOrgane) : 0;
221
220
  if (orgScore === 0 && agenda.organe === "Séance publique") {
222
- return 0.5 * titleScore + 0.5 * timeScore;
221
+ return 0.3 * titleScore + 0.7 * timeScore;
223
222
  }
224
- return 0.4 * titleScore + 0.3 * timeScore + orgScore * 0.3;
223
+ return 0.2 * titleScore + 0.4 * timeScore + orgScore * 0.4;
225
224
  }
226
225
  /**
227
226
  * Build search strategies for senat's videos
@@ -1,3 +1,4 @@
1
1
  export declare function isOptionEmptyOrHasValue(option: string, value: string): boolean;
2
+ export declare function ensureAndClearDirSync(dir: string): void;
2
3
  export declare function ensureAndClearDir(path: string): void;
3
4
  export declare function fetchWithRetry(url: string, retries?: number, backoff?: number): Promise<Response>;
@@ -1,7 +1,14 @@
1
1
  import fs from "fs-extra";
2
+ import path from "path";
2
3
  export function isOptionEmptyOrHasValue(option, value) {
3
4
  return !option || option.length === 0 || option.includes(value);
4
5
  }
6
+ export function ensureAndClearDirSync(dir) {
7
+ fs.ensureDirSync(dir);
8
+ for (const name of fs.readdirSync(dir)) {
9
+ fs.rmSync(path.join(dir, name), { recursive: true, force: true });
10
+ }
11
+ }
5
12
  export function ensureAndClearDir(path) {
6
13
  if (!fs.existsSync(path)) {
7
14
  fs.mkdirSync(path, { recursive: true });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tricoteuses/senat",
3
- "version": "2.16.2",
3
+ "version": "2.16.3",
4
4
  "description": "Handle French Sénat's open data",
5
5
  "keywords": [
6
6
  "France",