@tricoteuses/senat 2.22.9 → 2.22.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -101,6 +101,9 @@ async function processDocument(url, destPath, docDate, options) {
|
|
|
101
101
|
}
|
|
102
102
|
export async function processTexte(texteMetadata, originalTextesDir, transformedTextesDir, enrichedTextesDir, options) {
|
|
103
103
|
const texteDir = path.join(originalTextesDir, `${texteMetadata.session ?? UNDEFINED_SESSION}`, texteMetadata.name);
|
|
104
|
+
// Pre-compute whether the parsed JSON output already exists, to avoid re-parsing unchanged files
|
|
105
|
+
const parsedJsonPath = path.join(transformedTextesDir, `${texteMetadata.session ?? UNDEFINED_SESSION}`, texteMetadata.name, `${texteMetadata.name}.json`);
|
|
106
|
+
const parsedOutputExists = options.parseDocuments ? await fs.pathExists(parsedJsonPath) : false;
|
|
104
107
|
let exposeDesMotifsContent = null;
|
|
105
108
|
if (texteMetadata.url_expose_des_motifs) {
|
|
106
109
|
const exposePath = path.join(texteDir, `${texteMetadata.name}-expose.html`);
|
|
@@ -108,7 +111,8 @@ export async function processTexte(texteMetadata, originalTextesDir, transformed
|
|
|
108
111
|
if (res.buffer) {
|
|
109
112
|
exposeDesMotifsContent = res.buffer;
|
|
110
113
|
}
|
|
111
|
-
else if (res.skipped && options.parseDocuments) {
|
|
114
|
+
else if (res.skipped && options.parseDocuments && !parsedOutputExists) {
|
|
115
|
+
// Only load expose from disk if we'll actually need to re-parse
|
|
112
116
|
if (await fs.pathExists(exposePath)) {
|
|
113
117
|
exposeDesMotifsContent = await fs.readFile(exposePath);
|
|
114
118
|
}
|
|
@@ -128,23 +132,28 @@ export async function processTexte(texteMetadata, originalTextesDir, transformed
|
|
|
128
132
|
const result = await processDocument(format.url.toString(), destPath, texteMetadata.date, options);
|
|
129
133
|
// Specific logic: Parsing (Only applies to XML)
|
|
130
134
|
if (format.isParseTarget && options.parseDocuments) {
|
|
131
|
-
if
|
|
135
|
+
// Skip re-parsing if the XML was not newly downloaded AND the parsed output already exists
|
|
136
|
+
const needsParsing = !result.skipped || !parsedOutputExists;
|
|
137
|
+
if (needsParsing && (result.buffer !== null || (await fs.pathExists(destPath)))) {
|
|
132
138
|
await parseDocument(texteMetadata.session, transformedTextesDir, destPath, texteMetadata.name, result.buffer, exposeDesMotifsContent, options);
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
texteXmlContent = await fs.readFile(destPath, "utf-8");
|
|
137
|
-
}
|
|
138
|
-
else if (result.buffer !== null) {
|
|
139
|
-
texteXmlContent = textDecoder.decode(result.buffer);
|
|
140
|
-
}
|
|
141
|
-
if (texteXmlContent !== null) {
|
|
142
|
-
try {
|
|
143
|
-
await convertSenatXmlToHtml(texteXmlContent, path.join(enrichedTextesDir, `${texteMetadata.session ?? UNDEFINED_SESSION}`, texteMetadata.name, `${texteMetadata.name}.html`));
|
|
139
|
+
let texteXmlContent = null;
|
|
140
|
+
if (result.buffer === null && (await fs.pathExists(destPath))) {
|
|
141
|
+
texteXmlContent = await fs.readFile(destPath, "utf-8");
|
|
144
142
|
}
|
|
145
|
-
|
|
146
|
-
|
|
143
|
+
else if (result.buffer !== null) {
|
|
144
|
+
texteXmlContent = textDecoder.decode(result.buffer);
|
|
147
145
|
}
|
|
146
|
+
if (texteXmlContent !== null) {
|
|
147
|
+
try {
|
|
148
|
+
await convertSenatXmlToHtml(texteXmlContent, path.join(enrichedTextesDir, `${texteMetadata.session ?? UNDEFINED_SESSION}`, texteMetadata.name, `${texteMetadata.name}.html`));
|
|
149
|
+
}
|
|
150
|
+
catch (error) {
|
|
151
|
+
console.error(`Error converting ${texteMetadata.name} to HTML: ${error.message}`);
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
else if (options.verbose) {
|
|
156
|
+
console.info(`Skipping parse for already processed texte ${texteMetadata.name}…`);
|
|
148
157
|
}
|
|
149
158
|
}
|
|
150
159
|
}
|
|
@@ -13,6 +13,9 @@ export declare function isNoiseBlock(text: string): boolean;
|
|
|
13
13
|
export declare function scoreSommaireBlockForEvent(blockText: string, ev: AgendaEvent): number;
|
|
14
14
|
export declare function getOrgKey(norm: string): string;
|
|
15
15
|
export declare function dice(a: string, b: string): number;
|
|
16
|
+
/**
 * Share of the distinct tokens of `reference` that are also present in `candidate`.
 *
 * The measure is asymmetric: the intersection count is divided by the number of
 * distinct `reference` tokens only.
 *
 * @param reference - Text whose tokens must be found; null, undefined or empty gives 0.
 * @param candidate - Text searched for those tokens; null, undefined or empty gives 0.
 * @returns A ratio between 0 and 1.
 */
export declare function coverage(reference?: string | null, candidate?: string | null): number;
|
|
17
|
+
/**
 * Dice-style overlap of the distinct token sets of `a` and `b`:
 * twice the number of shared tokens divided by the sum of both set sizes.
 *
 * @param a - First text; null, undefined or empty gives 0.
 * @param b - Second text; null, undefined or empty gives 0.
 * @returns A ratio between 0 and 1.
 */
export declare function diceFiltered(a?: string | null, b?: string | null): number;
|
|
18
|
+
/**
 * Blended text similarity: 0.7 × `coverage(a, b)` + 0.3 × `diceFiltered(a, b)`.
 *
 * Because the coverage term is asymmetric, `a` acts as the reference text and
 * swapping the arguments may change the result.
 *
 * @param a - Reference text (may be null/undefined).
 * @param b - Candidate text (may be null/undefined).
 * @returns The weighted score; 0 when either text yields no tokens.
 */
export declare function similarityScore(a?: string | null, b?: string | null): number;
|
|
16
19
|
export declare function normalize(s?: string | null): string;
|
|
17
20
|
export declare function normalizeSalle(s?: string | null): string | null;
|
|
18
21
|
export declare function scoreVideo(agenda: Reunion, agendaTs: number | null, sameOrg: boolean, w: VideoScoreWeights, videoTitle?: string, videoEpoch?: number, videoOrganes?: string[], timeAmbigious?: boolean, salle?: string, chapterTitles?: L1Chapter[]): {
|
package/lib/src/utils/scoring.js
CHANGED
|
@@ -154,6 +154,38 @@ export function dice(a, b) {
|
|
|
154
154
|
inter++;
|
|
155
155
|
return (2 * inter) / (A.size + B.size);
|
|
156
156
|
}
|
|
157
|
+
/**
 * Returns the distinct tokens of `s` as a Set.
 *
 * Falsy input (null, undefined, empty string) short-circuits to an empty Set
 * without invoking the tokenizer.
 * NOTE(review): `tokens` is defined elsewhere in this module; whatever
 * normalisation or filtering it applies is not visible from this block.
 */
function tokenSet(s) {
    return s ? new Set(tokens(s)) : new Set();
}
|
|
162
|
+
/**
 * Measures how much of `reference` is covered by `candidate`.
 *
 * Both inputs are reduced to sets of distinct tokens; the result is the number
 * of reference tokens also found in the candidate, divided by the number of
 * reference tokens. The measure is therefore asymmetric.
 *
 * @param reference Text whose tokens should be found (null/undefined allowed).
 * @param candidate Text searched for those tokens (null/undefined allowed).
 * @returns A ratio in [0, 1]; 0 when either side has no tokens.
 */
export function coverage(reference, candidate) {
    const referenceTokens = tokenSet(reference);
    const candidateTokens = tokenSet(candidate);
    if (referenceTokens.size === 0 || candidateTokens.size === 0) {
        return 0;
    }
    // Count the reference tokens that the candidate also contains.
    const sharedCount = [...referenceTokens].filter((token) => candidateTokens.has(token)).length;
    return sharedCount / referenceTokens.size;
}
|
|
173
|
+
/**
 * Dice-style overlap between the distinct token sets of two texts:
 * 2 * |A ∩ B| / (|A| + |B|).
 *
 * NOTE(review): the "filtered" in the name presumably refers to what the
 * shared tokenizer keeps or drops; that logic lives outside this block.
 *
 * @param a First text (null/undefined allowed).
 * @param b Second text (null/undefined allowed).
 * @returns A ratio in [0, 1]; 0 when either side has no tokens.
 */
export function diceFiltered(a, b) {
    const leftTokens = tokenSet(a);
    const rightTokens = tokenSet(b);
    if (leftTokens.size === 0 || rightTokens.size === 0) {
        return 0;
    }
    let shared = 0;
    leftTokens.forEach((token) => {
        if (rightTokens.has(token)) {
            shared += 1;
        }
    });
    return (2 * shared) / (leftTokens.size + rightTokens.size);
}
|
|
184
|
+
/**
 * Combined similarity between two texts: a weighted sum of the asymmetric
 * coverage of `a` by `b` (weight 0.7) and their symmetric Dice-style overlap
 * (weight 0.3).
 *
 * @param a Reference text (null/undefined allowed).
 * @param b Candidate text (null/undefined allowed).
 * @returns The blended score; 0 when either text yields no tokens.
 */
export function similarityScore(a, b) {
    const COVERAGE_WEIGHT = 0.7;
    const DICE_WEIGHT = 0.3;
    // Coverage is evaluated first, then the Dice term, as in a left-to-right sum.
    return COVERAGE_WEIGHT * coverage(a, b) + DICE_WEIGHT * diceFiltered(a, b);
}
|
|
157
189
|
export function normalize(s) {
|
|
158
190
|
return (s ?? "")
|
|
159
191
|
.toLowerCase()
|
|
@@ -174,13 +206,13 @@ export function normalizeSalle(s) {
|
|
|
174
206
|
}
|
|
175
207
|
export function scoreVideo(agenda, agendaTs, sameOrg, w, videoTitle, videoEpoch, videoOrganes, timeAmbigious = false, salle, chapterTitles) {
|
|
176
208
|
const weights = w;
|
|
177
|
-
const objetS =
|
|
178
|
-
const titleS =
|
|
209
|
+
const objetS = similarityScore(agenda.objet || "", videoTitle || "");
|
|
210
|
+
const titleS = similarityScore(agenda.titre || "", videoTitle || "");
|
|
179
211
|
let titleScore = Math.max(objetS, titleS);
|
|
180
212
|
chapterTitles = chapterTitles || [];
|
|
181
213
|
for (const ch of chapterTitles) {
|
|
182
|
-
const chObjetS =
|
|
183
|
-
const chTitreS =
|
|
214
|
+
const chObjetS = similarityScore(agenda.objet || "", ch.label);
|
|
215
|
+
const chTitreS = similarityScore(agenda.titre || "", ch.label);
|
|
184
216
|
titleScore = Math.max(titleScore, Math.max(chObjetS, chTitreS));
|
|
185
217
|
}
|
|
186
218
|
let timeScore = 0;
|
|
@@ -190,8 +222,7 @@ export function scoreVideo(agenda, agendaTs, sameOrg, w, videoTitle, videoEpoch,
|
|
|
190
222
|
}
|
|
191
223
|
let orgScore = 0;
|
|
192
224
|
if (agenda.organe && videoOrganes?.length) {
|
|
193
|
-
|
|
194
|
-
orgScore = Math.max(...videoOrganes.map((v) => dice(agenda.organe, v)));
|
|
225
|
+
orgScore = Math.max(...videoOrganes.map((v) => similarityScore(agenda.organe, v)));
|
|
195
226
|
}
|
|
196
227
|
// Salle: normalized "A263" matching
|
|
197
228
|
let salleScore = 0;
|
|
@@ -12,8 +12,8 @@ export type MatchResult = {
|
|
|
12
12
|
reason?: "margin_ambiguous_time_sp";
|
|
13
13
|
};
|
|
14
14
|
export type MatchWeights = {
|
|
15
|
-
minAccept
|
|
16
|
-
margin
|
|
15
|
+
minAccept: number;
|
|
16
|
+
margin: number;
|
|
17
17
|
titleDominance?: number;
|
|
18
18
|
orgUncertainPenalty?: number;
|
|
19
19
|
orgSkipDice?: number;
|