@tricoteuses/senat 2.22.9 → 2.22.11

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -101,6 +101,9 @@ async function processDocument(url, destPath, docDate, options) {
101
101
  }
102
102
  export async function processTexte(texteMetadata, originalTextesDir, transformedTextesDir, enrichedTextesDir, options) {
103
103
  const texteDir = path.join(originalTextesDir, `${texteMetadata.session ?? UNDEFINED_SESSION}`, texteMetadata.name);
104
+ // Pre-compute whether the parsed JSON output already exists, to avoid re-parsing unchanged files
105
+ const parsedJsonPath = path.join(transformedTextesDir, `${texteMetadata.session ?? UNDEFINED_SESSION}`, texteMetadata.name, `${texteMetadata.name}.json`);
106
+ const parsedOutputExists = options.parseDocuments ? await fs.pathExists(parsedJsonPath) : false;
104
107
  let exposeDesMotifsContent = null;
105
108
  if (texteMetadata.url_expose_des_motifs) {
106
109
  const exposePath = path.join(texteDir, `${texteMetadata.name}-expose.html`);
@@ -108,7 +111,8 @@ export async function processTexte(texteMetadata, originalTextesDir, transformed
108
111
  if (res.buffer) {
109
112
  exposeDesMotifsContent = res.buffer;
110
113
  }
111
- else if (res.skipped && options.parseDocuments) {
114
+ else if (res.skipped && options.parseDocuments && !parsedOutputExists) {
115
+ // Only load expose from disk if we'll actually need to re-parse
112
116
  if (await fs.pathExists(exposePath)) {
113
117
  exposeDesMotifsContent = await fs.readFile(exposePath);
114
118
  }
@@ -128,23 +132,28 @@ export async function processTexte(texteMetadata, originalTextesDir, transformed
128
132
  const result = await processDocument(format.url.toString(), destPath, texteMetadata.date, options);
129
133
  // Specific logic: Parsing (Only applies to XML)
130
134
  if (format.isParseTarget && options.parseDocuments) {
131
- if (result.buffer !== null || (await fs.pathExists(destPath))) {
135
+ // Skip re-parsing if the XML was not newly downloaded AND the parsed output already exists
136
+ const needsParsing = !result.skipped || !parsedOutputExists;
137
+ if (needsParsing && (result.buffer !== null || (await fs.pathExists(destPath)))) {
132
138
  await parseDocument(texteMetadata.session, transformedTextesDir, destPath, texteMetadata.name, result.buffer, exposeDesMotifsContent, options);
133
- }
134
- let texteXmlContent = null;
135
- if (result.buffer === null && (await fs.pathExists(destPath))) {
136
- texteXmlContent = await fs.readFile(destPath, "utf-8");
137
- }
138
- else if (result.buffer !== null) {
139
- texteXmlContent = textDecoder.decode(result.buffer);
140
- }
141
- if (texteXmlContent !== null) {
142
- try {
143
- await convertSenatXmlToHtml(texteXmlContent, path.join(enrichedTextesDir, `${texteMetadata.session ?? UNDEFINED_SESSION}`, texteMetadata.name, `${texteMetadata.name}.html`));
139
+ let texteXmlContent = null;
140
+ if (result.buffer === null && (await fs.pathExists(destPath))) {
141
+ texteXmlContent = await fs.readFile(destPath, "utf-8");
144
142
  }
145
- catch (error) {
146
- console.error(`Error converting ${texteMetadata.name} to HTML: ${error.message}`);
143
+ else if (result.buffer !== null) {
144
+ texteXmlContent = textDecoder.decode(result.buffer);
147
145
  }
146
+ if (texteXmlContent !== null) {
147
+ try {
148
+ await convertSenatXmlToHtml(texteXmlContent, path.join(enrichedTextesDir, `${texteMetadata.session ?? UNDEFINED_SESSION}`, texteMetadata.name, `${texteMetadata.name}.html`));
149
+ }
150
+ catch (error) {
151
+ console.error(`Error converting ${texteMetadata.name} to HTML: ${error.message}`);
152
+ }
153
+ }
154
+ }
155
+ else if (options.verbose) {
156
+ console.info(`Skipping parse for already processed texte ${texteMetadata.name}…`);
148
157
  }
149
158
  }
150
159
  }
@@ -13,6 +13,9 @@ export declare function isNoiseBlock(text: string): boolean;
13
13
  export declare function scoreSommaireBlockForEvent(blockText: string, ev: AgendaEvent): number;
14
14
  export declare function getOrgKey(norm: string): string;
15
15
  export declare function dice(a: string, b: string): number;
16
+ export declare function coverage(reference?: string | null, candidate?: string | null): number;
17
+ export declare function diceFiltered(a?: string | null, b?: string | null): number;
18
+ export declare function similarityScore(a?: string | null, b?: string | null): number;
16
19
  export declare function normalize(s?: string | null): string;
17
20
  export declare function normalizeSalle(s?: string | null): string | null;
18
21
  export declare function scoreVideo(agenda: Reunion, agendaTs: number | null, sameOrg: boolean, w: VideoScoreWeights, videoTitle?: string, videoEpoch?: number, videoOrganes?: string[], timeAmbigious?: boolean, salle?: string, chapterTitles?: L1Chapter[]): {
@@ -154,6 +154,38 @@ export function dice(a, b) {
154
154
  inter++;
155
155
  return (2 * inter) / (A.size + B.size);
156
156
  }
157
+ function tokenSet(s) {
158
+ if (!s)
159
+ return new Set();
160
+ return new Set(tokens(s));
161
+ }
162
+ export function coverage(reference, candidate) {
163
+ const A = tokenSet(reference);
164
+ const B = tokenSet(candidate);
165
+ if (!A.size || !B.size)
166
+ return 0;
167
+ let inter = 0;
168
+ for (const t of A)
169
+ if (B.has(t))
170
+ inter++;
171
+ return inter / A.size;
172
+ }
173
+ export function diceFiltered(a, b) {
174
+ const A = tokenSet(a);
175
+ const B = tokenSet(b);
176
+ if (!A.size || !B.size)
177
+ return 0;
178
+ let inter = 0;
179
+ for (const t of A)
180
+ if (B.has(t))
181
+ inter++;
182
+ return (2 * inter) / (A.size + B.size);
183
+ }
184
+ export function similarityScore(a, b) {
185
+ const cov = coverage(a, b);
186
+ const d = diceFiltered(a, b);
187
+ return 0.7 * cov + 0.3 * d;
188
+ }
157
189
  export function normalize(s) {
158
190
  return (s ?? "")
159
191
  .toLowerCase()
@@ -174,13 +206,13 @@ export function normalizeSalle(s) {
174
206
  }
175
207
  export function scoreVideo(agenda, agendaTs, sameOrg, w, videoTitle, videoEpoch, videoOrganes, timeAmbigious = false, salle, chapterTitles) {
176
208
  const weights = w;
177
- const objetS = dice(agenda.objet || "", videoTitle || "");
178
- const titleS = dice(agenda.titre || "", videoTitle || "");
209
+ const objetS = similarityScore(agenda.objet || "", videoTitle || "");
210
+ const titleS = similarityScore(agenda.titre || "", videoTitle || "");
179
211
  let titleScore = Math.max(objetS, titleS);
180
212
  chapterTitles = chapterTitles || [];
181
213
  for (const ch of chapterTitles) {
182
- const chObjetS = dice(agenda.objet || "", ch.label);
183
- const chTitreS = dice(agenda.titre || "", ch.label);
214
+ const chObjetS = similarityScore(agenda.objet || "", ch.label);
215
+ const chTitreS = similarityScore(agenda.titre || "", ch.label);
184
216
  titleScore = Math.max(titleScore, Math.max(chObjetS, chTitreS));
185
217
  }
186
218
  let timeScore = 0;
@@ -190,8 +222,7 @@ export function scoreVideo(agenda, agendaTs, sameOrg, w, videoTitle, videoEpoch,
190
222
  }
191
223
  let orgScore = 0;
192
224
  if (agenda.organe && videoOrganes?.length) {
193
- // NOTE: if you already normalize organes elsewhere, keep it there.
194
- orgScore = Math.max(...videoOrganes.map((v) => dice(agenda.organe, v)));
225
+ orgScore = Math.max(...videoOrganes.map((v) => similarityScore(agenda.organe, v)));
195
226
  }
196
227
  // Salle: normalized "A263" matching
197
228
  let salleScore = 0;
@@ -12,8 +12,8 @@ export type MatchResult = {
12
12
  reason?: "margin_ambiguous_time_sp";
13
13
  };
14
14
  export type MatchWeights = {
15
- minAccept?: number;
16
- margin?: number;
15
+ minAccept: number;
16
+ margin: number;
17
17
  titleDominance?: number;
18
18
  orgUncertainPenalty?: number;
19
19
  orgSkipDice?: number;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tricoteuses/senat",
3
- "version": "2.22.9",
3
+ "version": "2.22.11",
4
4
  "description": "Handle French Sénat's open data",
5
5
  "keywords": [
6
6
  "France",