@tricoteuses/senat 2.20.35 → 2.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,9 +12,17 @@ export declare const SENAT_DATAS_ROOT = "https://videos.senat.fr/Datas/senat";
12
12
  export declare function fetchText(url: string): Promise<string | null>;
13
13
  export declare function fetchBuffer(url: string): Promise<Buffer | null>;
14
14
  export declare function queryString(obj: Record<string, string>): string;
15
+ export declare function parseDataNvs(nvs: string): {
16
+ epoch?: number;
17
+ organes: string[];
18
+ firstChapterLabel?: string;
19
+ salle?: string;
20
+ };
21
+ export declare function buildSenatVodMasterM3u8FromNvs(nvsText: string): string | null;
15
22
  export declare function isAmbiguousTimeOriginal(timeOriginal?: string | null): boolean;
16
23
  export declare function getAgendaType(agenda: Reunion): string;
17
24
  export declare function fetchAllSearchPages(args: SearchParams, maxPages?: number): Promise<string[]>;
25
+ export declare function getOrgKey(norm: string): string;
18
26
  export type MatchResult = {
19
27
  reunionUid: string;
20
28
  picked: null | {
@@ -8,7 +8,8 @@ import * as cheerio from "cheerio";
8
8
  import { AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatAgendas } from "../loaders";
9
9
  import { getSessionsFromStart } from "../types/sessions";
10
10
  import { commonOptions } from "./shared/cli_helpers";
11
- import { buildSenatVodMasterM3u8FromNvs, getAgendaSegmentTimecodes } from "../utils/nvs-parsing";
11
+ import { getAgendaSegmentTimecodes } from "../utils/nvs-timecode";
12
+ import { decodeHtmlEntities } from "../utils/string_cleaning";
12
13
  import { matchOneReunion } from "../utils/scoring";
13
14
  import { epochToParisDateTime, toFRDate, toTargetEpoch } from "../utils/date";
14
15
  import { pathToFileURL } from "url";
@@ -76,6 +77,39 @@ function extractCandidatesFromSearchHtml(html) {
76
77
  return true;
77
78
  });
78
79
  }
80
/**
 * Extracts the metadata used for video matching from the raw text of a
 * `data.nvs` document. The text is scanned with regular expressions; it is
 * not parsed as XML.
 *
 * @param nvs Raw content of the `data.nvs` document.
 * @returns An object with:
 *  - `epoch`: numeric value of the `date` metadata tag, `undefined` when absent;
 *  - `organes`: decoded, trimmed `label` of every `organes` metadata tag, or
 *    `["Séance publique"]` when none yields a label;
 *  - `firstChapterLabel`: decoded, trimmed label of the first `<chapter>` tag
 *    carrying a `label` attribute, `undefined` when there is none;
 *  - `salle`: decoded, trimmed value of the `salle` metadata tag, `undefined`
 *    when the tag is missing or blank.
 */
export function parseDataNvs(nvs) {
  const epochStr = nvs.match(/<metadata\s+name="date"\s+value="(\d+)"/i)?.[1];
  const epoch = epochStr ? Number(epochStr) : undefined;

  // Decode only when the attribute was actually captured: handing `undefined`
  // to the decoder and then calling `.trim()` on its result either throws or
  // produces a misleading empty string, while `salle` is documented as optional.
  const salleRaw = nvs.match(/<metadata\s+name="salle"\s+value="([^"]+)"/i)?.[1];
  const salle = salleRaw ? decodeHtmlEntities(salleRaw).trim() || undefined : undefined;

  // There can be multiple organes for one video in meta.
  const organes = [];
  for (const [tag] of nvs.matchAll(/<metadata\b[^>]*\bname="organes"[^>]*>/gi)) {
    const label = tag.match(/\blabel="([^"]+)"/i)?.[1];
    if (!label) continue;
    const decoded = decodeHtmlEntities(label).trim();
    if (decoded) organes.push(decoded);
  }
  // Fallback label used when the document declares no organe at all.
  if (organes.length === 0) {
    organes.push("Séance publique");
  }

  const firstChapterLabelMatch = nvs.match(/<chapter\b[^>]*\blabel="([^"]+)"/i);
  const firstChapterLabel = firstChapterLabelMatch
    ? decodeHtmlEntities(firstChapterLabelMatch[1]).trim()
    : undefined;

  return { epoch, organes, firstChapterLabel, salle };
}
104
/**
 * Builds the HLS master playlist URL of a video from the text of its NVS
 * document.
 *
 * The text is searched for a media reference of the form
 * `serverfiles://senat/2025/10/encoder10_20251022084451_2.mp4`; the year,
 * month, encoder name and 14-digit stamp found there are re-assembled into
 * `https://vodsenat.akamaized.net/senat/<yyyy>/<mm>/<encoder>_<stamp>.smil/master.m3u8`.
 *
 * @param nvsText Raw NVS text. `null`, `undefined` and `""` are tolerated so
 *   the result of a failed download can be passed straight through.
 * @returns The playlist URL, or `null` when the text is missing or holds no
 *   matching media reference.
 */
export function buildSenatVodMasterM3u8FromNvs(nvsText) {
  // A missing document means "no URL", exactly like a document without a
  // media reference; do not let `.exec` throw on a nullish input.
  if (!nvsText) return null;

  const found = /serverfiles:\/\/senat\/(\d{4})\/(\d{2})\/(encoder\d+)_([0-9]{14})/i.exec(nvsText);
  if (!found) return null;

  const [, yyyy, mm, encoder, stamp] = found;
  return `https://vodsenat.akamaized.net/senat/${yyyy}/${mm}/${encoder}_${stamp}.smil/master.m3u8`;
}
79
113
  export function isAmbiguousTimeOriginal(timeOriginal) {
80
114
  if (!timeOriginal)
81
115
  return false;
@@ -121,6 +155,45 @@ export async function fetchAllSearchPages(args, maxPages = 3) {
121
155
  }
122
156
  return pages;
123
157
  }
158
/**
 * Maps an organe name to a stable bucket key.
 *
 * Matching is plain substring containment against `norm`, and the rules are
 * evaluated top to bottom: the first rule with a needle contained in `norm`
 * wins, so the order of the table is significant.
 *
 * @param norm Organe name; the needles are lowercase, accent-free and
 *   space-separated, so the input is presumably already normalized that way
 *   by the caller (to be confirmed).
 * @returns The key of the first matching rule, or `"autre"` when `norm` is
 *   empty or no rule matches.
 */
export function getOrgKey(norm) {
  if (!norm) return "autre";

  // Ordered rule table: earlier entries take precedence over later ones.
  const rules = [
    { key: "seance_publique", needles: ["seance publique"] },
    { key: "culture", needles: ["culture"] },
    { key: "finances", needles: ["finances"] },
    { key: "affaires_sociales", needles: ["sociales"] },
    { key: "affaires_economiques", needles: ["economiques"] },
    { key: "affaires_europeennes", needles: ["europeennes"] },
    { key: "affaires_etrangeres_defense", needles: ["etrangeres", "forces armees", "defense"] },
    { key: "amenagement_territoire_dd", needles: ["territoire", "durable"] },
    { key: "lois", needles: ["commission des lois"] },
    {
      key: "delegation_collectivites",
      needles: ["delegation aux collectivites territoriales", "delegation a la decentralisation"],
    },
    {
      key: "delegation_droits_femmes",
      needles: ["delegation aux droits des femmes", "egalite des chances entre les hommes et les femmes"],
    },
    { key: "delegation_entreprises", needles: ["delegation aux entreprises"] },
    {
      key: "delegation_outre_mer",
      needles: ["delegation senatoriale aux outre mer", "delegation aux outre mer"],
    },
    { key: "delegation_prospective", needles: ["delegation a la prospective"] },
    {
      key: "opecst",
      needles: ["office parlementaire d evaluation des choix scientifiques et technologiques", "opecst"],
    },
  ];

  const hit = rules.find((rule) => rule.needles.some((needle) => norm.includes(needle)));
  return hit ? hit.key : "autre";
}
124
197
  async function processGroupedReunion(agenda, session, dataDir) {
125
198
  // 1) GuardRails
126
199
  if (!agenda.captationVideo) {
@@ -1,5 +1,5 @@
1
1
  import { AgendaEvent, Reunion } from "../types/agenda";
2
- import { L1Chapter } from "./nvs-parsing";
2
+ import { L1Chapter } from "./nvs-timecode";
3
3
  export type MatchWeights = {
4
4
  minAccept?: number;
5
5
  margin?: number;
@@ -60,7 +60,6 @@ export type BestMatch = {
60
60
  m3u8: string;
61
61
  signals: VideoScoreSignals;
62
62
  };
63
- export declare function getOrgKey(norm: string): string;
64
63
  export declare function dice(a: string, b: string): number;
65
64
  export declare function normalize(s?: string | null): string;
66
65
  export declare function normalizeSalle(s?: string | null): string | null;
@@ -1,4 +1,5 @@
1
- import { buildSenatVodMasterM3u8FromNvs, getLevel1Chapters, parseDataNvs } from "./nvs-parsing";
1
+ import { buildSenatVodMasterM3u8FromNvs, getOrgKey, parseDataNvs } from "../scripts/retrieve_videos";
2
+ import { getLevel1Chapters } from "./nvs-timecode";
2
3
  import { parseISO } from "./reunion_parsing";
3
4
  import { normalizeText } from "./string_cleaning";
4
5
  export function jaccard(a, b) {
@@ -103,45 +104,6 @@ function extractHourHints(text) {
103
104
  out.push({ h: Number(m[1]), m: Number(m[2]) });
104
105
  return out;
105
106
  }
106
- export function getOrgKey(norm) {
107
- if (!norm)
108
- return "autre";
109
- if (norm.includes("seance publique"))
110
- return "seance_publique";
111
- if (norm.includes("culture"))
112
- return "culture";
113
- if (norm.includes("finances"))
114
- return "finances";
115
- if (norm.includes("sociales"))
116
- return "affaires_sociales";
117
- if (norm.includes("economiques"))
118
- return "affaires_economiques";
119
- if (norm.includes("europeennes"))
120
- return "affaires_europeennes";
121
- if (norm.includes("etrangeres") || norm.includes("forces armees") || norm.includes("defense")) {
122
- return "affaires_etrangeres_defense";
123
- }
124
- if (norm.includes("territoire") || norm.includes("durable")) {
125
- return "amenagement_territoire_dd";
126
- }
127
- if (norm.includes("commission des lois"))
128
- return "lois";
129
- if (norm.includes("delegation aux collectivites territoriales") || norm.includes("delegation a la decentralisation"))
130
- return "delegation_collectivites";
131
- if (norm.includes("delegation aux droits des femmes") ||
132
- norm.includes("egalite des chances entre les hommes et les femmes"))
133
- return "delegation_droits_femmes";
134
- if (norm.includes("delegation aux entreprises"))
135
- return "delegation_entreprises";
136
- if (norm.includes("delegation senatoriale aux outre mer") || norm.includes("delegation aux outre mer"))
137
- return "delegation_outre_mer";
138
- if (norm.includes("delegation a la prospective"))
139
- return "delegation_prospective";
140
- if (norm.includes("office parlementaire d evaluation des choix scientifiques et technologiques") ||
141
- norm.includes("opecst"))
142
- return "opecst";
143
- return "autre";
144
- }
145
107
  function tokensDice(s) {
146
108
  return normalize(s).split(" ").filter(Boolean);
147
109
  }
@@ -8,10 +8,10 @@ import { describe, it, expect } from "vitest";
8
8
  import * as fs from "node:fs/promises";
9
9
  import * as path from "node:path";
10
10
  import * as cheerio from "cheerio";
11
- import { fetchBuffer, fetchText, getAgendaType, isAmbiguousTimeOriginal, queryString, SENAT_DATAS_ROOT, SENAT_VIDEOS_SEARCH_AJAX, } from "../src/scripts/retrieve_videos";
11
+ import { buildSenatVodMasterM3u8FromNvs, fetchBuffer, fetchText, getAgendaType, getOrgKey, isAmbiguousTimeOriginal, parseDataNvs, queryString, SENAT_DATAS_ROOT, SENAT_VIDEOS_SEARCH_AJAX, } from "../src/scripts/retrieve_videos";
12
12
  import { toFRDate } from "../src/utils/date";
13
- import { dice, getOrgKey, normalize, scoreVideo } from "../src/utils/scoring";
14
- import { buildSenatVodMasterM3u8FromNvs, getLevel1Chapters, parseDataNvs } from "../src/utils/nvs-parsing";
13
+ import { dice, normalize, scoreVideo } from "../src/utils/scoring";
14
+ import { getLevel1Chapters } from "../src/utils/nvs-timecode";
15
15
  const LIVE_CACHE_DIR = path.join(process.cwd(), "tests", ".cache", "video-matching-live");
16
16
  const FIXTURES_ROOT = path.join(process.cwd(), "tests", "fixtures", "data");
17
17
  const GOLD_PATH = path.join(FIXTURES_ROOT, "expected-video-matching.json");
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tricoteuses/senat",
3
- "version": "2.20.35",
3
+ "version": "2.21.0",
4
4
  "description": "Handle French Sénat's open data",
5
5
  "keywords": [
6
6
  "France",
@@ -65,8 +65,7 @@
65
65
  "prettier": "prettier --write 'src/**/*.ts'",
66
66
  "test:iter_load": "tsx src/scripts/test_iter_load.ts",
67
67
  "type-check": "tsc --noEmit",
68
- "type-check:watch": "npm run type-check -- --watch",
69
- "test:video-matching": "vitest tests/videoMatching.test.ts"
68
+ "type-check:watch": "npm run type-check -- --watch"
70
69
  },
71
70
  "dependencies": {
72
71
  "cheerio": "^1.1.2",
@@ -83,9 +82,8 @@
83
82
  "pg-cursor": "^2.12.1",
84
83
  "slug": "^11.0.0",
85
84
  "tsx": "^4.21.0",
86
- "zod": "^4.3.5",
87
- "vitest": "^4.0.18",
88
- "windows-1252": "^3.0.4"
85
+ "windows-1252": "^3.0.4",
86
+ "zod": "^4.3.5"
89
87
  },
90
88
  "devDependencies": {
91
89
  "@typed-code/schemats": "^5.0.1",
@@ -1,24 +0,0 @@
1
- export type L1Chapter = {
2
- id: string;
3
- label: string;
4
- index: number;
5
- };
6
- export declare function getLevel1Chapters(dataNvs: string): L1Chapter[];
7
- export declare function pickBestLevel1ChapterForAgenda(chapters: L1Chapter[], agendaTitle: string): {
8
- chapter: L1Chapter;
9
- score: number;
10
- } | null;
11
- export declare function getAgendaSegmentTimecodes(dataNvs: string, finalPlayerNvs: string, agendaTitleOrObjet: string): {
12
- start: number;
13
- end: number | null;
14
- chapterId: string;
15
- nextChapterId: string | null;
16
- score: number;
17
- } | null;
18
- export declare function parseDataNvs(nvs: string): {
19
- epoch?: number;
20
- organes: string[];
21
- firstChapterLabel?: string;
22
- salle?: string;
23
- };
24
- export declare function buildSenatVodMasterM3u8FromNvs(nvsText: string): string | null;
@@ -1,112 +0,0 @@
1
- import { XMLParser } from "fast-xml-parser";
2
- import { dice, normalize } from "./scoring";
3
- import { decodeHtmlEntities } from "./string_cleaning";
4
- const CHAPTER_MATCH_THRESHOLD = 0.5;
5
- const xmlParser = new XMLParser({
6
- ignoreAttributes: false,
7
- attributeNamePrefix: "@_",
8
- });
9
- function getTimecodeForChapterId(finalPlayerNvs, chapterId) {
10
- const xml = xmlParser.parse(finalPlayerNvs);
11
- const synchros = xml?.player?.synchro;
12
- if (!synchros)
13
- return null;
14
- const synchsArray = Array.isArray(synchros) ? synchros : [synchros];
15
- const match = synchsArray.find((s) => String(s["@_id"]) === String(chapterId));
16
- if (!match)
17
- return null;
18
- const rawTimecode = match["@_timecode"];
19
- if (rawTimecode == null)
20
- return null;
21
- const ms = Number(rawTimecode);
22
- if (Number.isNaN(ms))
23
- return null;
24
- return Math.floor(ms / 1000);
25
- }
26
- function toArray(v) {
27
- if (!v)
28
- return [];
29
- return Array.isArray(v) ? v : [v];
30
- }
31
- export function getLevel1Chapters(dataNvs) {
32
- const xml = xmlParser.parse(dataNvs);
33
- const root = xml?.data?.chapters?.chapter ?? xml?.chapters?.chapter;
34
- const roots = toArray(root);
35
- return roots
36
- .map((ch, i) => {
37
- const id = ch?.id ?? ch?.["@_id"];
38
- const labelRaw = ch?.label ?? ch?.["@_label"] ?? "";
39
- return {
40
- id: String(id),
41
- label: decodeHtmlEntities(String(labelRaw)).trim(),
42
- index: i,
43
- };
44
- })
45
- .filter((c) => c.id && c.label);
46
- }
47
- export function pickBestLevel1ChapterForAgenda(chapters, agendaTitle) {
48
- const q = normalize(agendaTitle);
49
- let best = null;
50
- for (const ch of chapters) {
51
- const s = dice(q, ch.label);
52
- if (!best || s > best.score)
53
- best = { chapter: ch, score: s };
54
- }
55
- if (!best || best.score < CHAPTER_MATCH_THRESHOLD)
56
- return { chapter: chapters[0], score: 0 };
57
- return best;
58
- }
59
- export function getAgendaSegmentTimecodes(dataNvs, finalPlayerNvs, agendaTitleOrObjet) {
60
- const l1 = getLevel1Chapters(dataNvs);
61
- if (!l1.length)
62
- return null;
63
- const best = pickBestLevel1ChapterForAgenda(l1, agendaTitleOrObjet);
64
- if (!best)
65
- return null;
66
- const chapter = best.chapter;
67
- const next = l1[chapter.index + 1] ?? null;
68
- const start = getTimecodeForChapterId(finalPlayerNvs, chapter.id);
69
- if (start == null)
70
- return null;
71
- const end = next ? getTimecodeForChapterId(finalPlayerNvs, next.id) : null;
72
- return {
73
- start,
74
- end,
75
- chapterId: chapter.id,
76
- nextChapterId: next?.id ?? null,
77
- score: best.score,
78
- };
79
- }
80
- export function parseDataNvs(nvs) {
81
- const epochStr = nvs.match(/<metadata\s+name="date"\s+value="(\d+)"/i)?.[1];
82
- const epoch = epochStr ? Number(epochStr) : undefined;
83
- // There can be multiple organes for one video in meta
84
- const organes = [];
85
- const organesRegex = /<metadata\b[^>]*\bname="organes"[^>]*>/gi;
86
- let m;
87
- const salle = decodeHtmlEntities(nvs.match(/<metadata\s+name="salle"\s+value="([^"]+)"/i)?.[1]).trim();
88
- while ((m = organesRegex.exec(nvs)) !== null) {
89
- const tag = m[0];
90
- const label = tag.match(/\blabel="([^"]+)"/i)?.[1];
91
- if (label) {
92
- const decoded = decodeHtmlEntities(label).trim();
93
- if (decoded)
94
- organes.push(decoded);
95
- }
96
- }
97
- if (organes.length === 0) {
98
- organes.push("Séance publique");
99
- }
100
- const firstChapterLabelMatch = nvs.match(/<chapter\b[^>]*\blabel="([^"]+)"/i);
101
- const firstChapterLabel = firstChapterLabelMatch ? decodeHtmlEntities(firstChapterLabelMatch[1]).trim() : undefined;
102
- return { epoch, organes, firstChapterLabel, salle };
103
- }
104
- export function buildSenatVodMasterM3u8FromNvs(nvsText) {
105
- // serverfiles://senat/2025/10/encoder10_20251022084451_2.mp4
106
- const m = nvsText.match(/serverfiles:\/\/senat\/(\d{4})\/(\d{2})\/(encoder\d+)_([0-9]{14})/i);
107
- if (!m)
108
- return null;
109
- const [, yyyy, mm, encoder, stamp] = m;
110
- const base = `https://vodsenat.akamaized.net/senat/${yyyy}/${mm}/${encoder}_${stamp}`;
111
- return `${base}.smil/master.m3u8`;
112
- }