@tricoteuses/senat 2.20.17 → 2.20.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +58 -19
- package/lib/git.d.ts +26 -0
- package/lib/git.js +167 -0
- package/lib/index.d.ts +1 -1
- package/lib/loaders.d.ts +3 -2
- package/lib/model/commission.d.ts +2 -2
- package/lib/model/commission.js +5 -4
- package/lib/model/seance.d.ts +2 -8
- package/lib/model/seance.js +28 -113
- package/lib/model/util.d.ts +0 -4
- package/lib/model/util.js +0 -38
- package/lib/scripts/convert_data.js +25 -1
- package/lib/scripts/retrieve_agenda.js +7 -18
- package/lib/scripts/retrieve_cr_commission.js +1 -10
- package/lib/scripts/retrieve_cr_seance.d.ts +1 -1
- package/lib/scripts/retrieve_cr_seance.js +183 -127
- package/lib/scripts/retrieve_videos.d.ts +1 -1
- package/lib/scripts/retrieve_videos.js +46 -92
- package/lib/scripts/shared/cli_helpers.d.ts +25 -3
- package/lib/scripts/shared/cli_helpers.js +28 -0
- package/lib/types/agenda.d.ts +5 -6
- package/lib/utils/cr_spliting.d.ts +2 -10
- package/lib/utils/cr_spliting.js +2 -119
- package/lib/utils/date.d.ts +10 -0
- package/lib/utils/date.js +100 -0
- package/lib/utils/reunion_odj_building.d.ts +2 -2
- package/lib/utils/reunion_odj_building.js +8 -12
- package/lib/utils/reunion_parsing.d.ts +23 -0
- package/lib/utils/reunion_parsing.js +209 -0
- package/lib/utils/scoring.d.ts +14 -0
- package/lib/utils/scoring.js +147 -0
- package/lib/utils/string_cleaning.d.ts +7 -0
- package/lib/utils/string_cleaning.js +57 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -29,40 +29,79 @@ docker run --name local-postgres -d -p 5432:5432 -e POSTGRES_PASSWORD=$YOUR_CUST
|
|
|
29
29
|
|
|
30
30
|
## Download data
|
|
31
31
|
|
|
32
|
+
### Basic usage
|
|
33
|
+
|
|
32
34
|
Create a folder where the data will be downloaded and run the following command to download the data and convert it into JSON files.
|
|
33
35
|
|
|
34
36
|
```bash
|
|
35
37
|
mkdir ../senat-data/
|
|
36
38
|
|
|
37
|
-
|
|
38
|
-
npm run data:download ../senat-data -- [--categories All]
|
|
39
|
+
npm run data:download ../senat-data
|
|
39
40
|
```
|
|
40
41
|
|
|
41
|
-
|
|
42
|
+
### Available Commands
|
|
43
|
+
|
|
44
|
+
- `npm run data:download <dir>`: Download the data and convert it to JSON
|
|
45
|
+
- `npm run data:retrieve_documents <dir>`: Retrieval of textes and rapports from Sénat's website
|
|
46
|
+
- `npm run data:parse_textes_lois <dir>`: Parse textes (requires xml files)
|
|
47
|
+
- `npm run data:retrieve_agenda <dir>`: Retrieval of agenda from Sénat's website
|
|
48
|
+
- `npm run data:retrieve_cr_seance <dir>`: Retrieval of comptes-rendus de séance from Sénat's data
|
|
49
|
+
- `npm run data:retrieve_cr_commission <dir>`: Retrieval of comptes-rendus de commissions from Sénat's website
|
|
50
|
+
- `npm run data:retrieve_senateurs_photos <dir>`: Retrieval of sénateurs' pictures from Sénat's website
|
|
51
|
+
|
|
52
|
+
### Filtering Options
|
|
53
|
+
|
|
54
|
+
Downloading all the data takes a long time and uses a lot of disk space. To reduce the load, you can restrict the download to the types of data you actually need.
|
|
55
|
+
|
|
56
|
+
Examples:
|
|
42
57
|
|
|
43
58
|
```bash
|
|
44
|
-
#
|
|
45
|
-
|
|
46
|
-
# Available options for optional `types` parameter : textes, rapports
|
|
47
|
-
npm run data:retrieve_documents ../senat-data -- --fromSession 2022 [--formats xml pdf] [--types textes]
|
|
59
|
+
# Only download amendments
|
|
60
|
+
npm run data:download ../senat-data -- -k Ameli
|
|
48
61
|
|
|
49
|
-
#
|
|
50
|
-
npm run data:
|
|
62
|
+
# Only process data from session 2023 onwards
|
|
63
|
+
npm run data:download ../senat-data -- --fromSession 2023
|
|
64
|
+
```
|
|
51
65
|
|
|
52
|
-
|
|
53
|
-
npm run data:parse_textes_lois ../senat-data
|
|
66
|
+
### Common Options
|
|
54
67
|
|
|
55
|
-
|
|
56
|
-
|
|
68
|
+
- `--categories` or `-k <name>`: Filter by dataset categories (Available options: `All`, `Ameli`, `Debats`, `DosLeg`, `Questions`, `Sens`)
|
|
69
|
+
- `--fromSession <year>`: Specify the session year to retrieve data from (default: 2022)
|
|
70
|
+
- `--dataDir <path>` (mandatory): Path to the working directory where all data is stored
|
|
71
|
+
- `--silent` or `-s`: Disable logging
|
|
72
|
+
- `--verbose` or `-v`: Enable verbose logging
|
|
73
|
+
- `--commit` or `-c`: Automatically commit converted data
|
|
74
|
+
- `--pull` or `-p`: Pull repositories before starting
|
|
75
|
+
- `--clone` or `-C <url>`: Clone Git repositories from a remote group or organization
|
|
76
|
+
- `--remote` or `-r <name>`: Push commits to specified Git remote(s)
|
|
77
|
+
- `--keepDir`: Keep directories when cleaning data
|
|
78
|
+
- `--only-recent <days>`: Retrieve only documents created within the last N days
|
|
79
|
+
|
|
80
|
+
### Options for Retrieving Documents
|
|
81
|
+
|
|
82
|
+
- `--formats <format>`: Specify document formats to retrieve (options: `xml`, `html`, `pdf`)
|
|
83
|
+
- `--types <type>`: Specify document types to retrieve (options: `textes`, `rapports`)
|
|
84
|
+
- `--parseDocuments`: Parse documents after retrieval
|
|
85
|
+
- `--parseAgenda`: Parse agenda after retrieval
|
|
86
|
+
- `--parseDebats`: Parse comptes-rendus after retrieval
|
|
87
|
+
|
|
88
|
+
#### Examples
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
# Retrieval of textes and rapports in specific formats
|
|
92
|
+
npm run data:retrieve_documents ../senat-data -- --fromSession 2022 --formats xml pdf --types textes
|
|
93
|
+
|
|
94
|
+
# Retrieval & parsing (textes in xml format only for now)
|
|
95
|
+
npm run data:retrieve_documents ../senat-data -- --fromSession 2022 --parseDocuments
|
|
57
96
|
|
|
58
|
-
# Retrieval
|
|
59
|
-
npm run data:
|
|
97
|
+
# Retrieval & parsing of agenda
|
|
98
|
+
npm run data:retrieve_agenda ../senat-data -- --fromSession 2022 --parseAgenda
|
|
60
99
|
|
|
61
|
-
# Retrieval
|
|
62
|
-
npm run data:
|
|
100
|
+
# Retrieval & parsing of comptes-rendus de séance
|
|
101
|
+
npm run data:retrieve_cr_seance ../senat-data -- --parseDebats --keepDir
|
|
63
102
|
|
|
64
|
-
# Retrieval
|
|
65
|
-
npm run data:
|
|
103
|
+
# Retrieval & parsing of comptes-rendus de commissions
|
|
104
|
+
npm run data:retrieve_cr_commission ../senat-data -- --parseDebats --keepDir
|
|
66
105
|
```
|
|
67
106
|
|
|
68
107
|
## Data download using Docker
|
package/lib/git.d.ts
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
export declare function initRepo(repositoryDir: string): void;
|
|
2
|
+
export declare function commit(repositoryDir: string, message: string): boolean;
|
|
3
|
+
export declare function commitAndPush(repositoryDir: string, message: string, remotes?: string[]): number;
|
|
4
|
+
export declare function resetAndPull(gitDir: string): boolean;
|
|
5
|
+
export declare function clone(gitGroupUrl: string | undefined, gitName: string, workingDir: string): void;
|
|
6
|
+
export declare function run(repositoryDir: string, args: string, verbose?: boolean): string;
|
|
7
|
+
export declare function test(repositoryDir: string, args: string, verbose?: boolean): boolean;
|
|
8
|
+
/**
|
|
9
|
+
* Information about a changed file in git
|
|
10
|
+
*/
|
|
11
|
+
export interface GitChangedFile {
|
|
12
|
+
path: string;
|
|
13
|
+
status: "A" | "M" | "D" | "R" | "C" | "T" | "U";
|
|
14
|
+
}
|
|
15
|
+
/**
|
|
16
|
+
* Get the list of files that have changed since a specific commit in a git repository.
|
|
17
|
+
* @param repositoryDir The directory of the git repository
|
|
18
|
+
* @param sinceCommit The commit hash to compare against (e.g., "HEAD~1", "abc123", etc.)
|
|
19
|
+
* @param options Options for filtering
|
|
20
|
+
* @param options.diffFilter Git diff-filter string (default: "AMR").
|
|
21
|
+
* A=Added, M=Modified, D=Deleted, R=Renamed, C=Copied, T=Type changed, U=Unmerged
|
|
22
|
+
* @returns A Map of file paths to their git status
|
|
23
|
+
*/
|
|
24
|
+
export declare function getChangedFilesSinceCommit(repositoryDir: string, sinceCommit: string, options?: {
|
|
25
|
+
diffFilter?: string;
|
|
26
|
+
}): Map<string, GitChangedFile["status"]>;
|
package/lib/git.js
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
import { execSync } from "node:child_process";
|
|
2
|
+
import fs from "fs-extra";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
const MAXBUFFER = 50 * 1024 * 1024;
|
|
5
|
+
/**
 * Make sure `repositoryDir` holds a Git repository.
 *
 * Nothing happens when a `.git` entry already exists in the directory.
 * Otherwise the directory is created if needed and `git init` is run in it.
 *
 * @param repositoryDir Directory that must contain a Git repository
 */
export function initRepo(repositoryDir) {
    const gitMetadataPath = path.join(repositoryDir, ".git");
    if (fs.existsSync(gitMetadataPath)) {
        return;
    }
    fs.ensureDirSync(repositoryDir);
    const initOptions = {
        cwd: repositoryDir,
        env: process.env,
        encoding: "utf-8",
        // stdin and stdout are discarded; stderr is piped instead of inherited.
        stdio: ["ignore", "ignore", "pipe"],
    };
    execSync("git init", initOptions);
}
|
|
16
|
+
/**
 * Stage every change in `repositoryDir` and commit it.
 *
 * The repository is initialized first when it does not exist yet.
 *
 * @param repositoryDir The directory of the git repository
 * @param message The commit message
 * @returns `true` when a commit was created, `false` when git reported that
 *   there was nothing to commit
 * @throws The error raised by `execSync` when staging fails, or when the
 *   commit fails for any reason other than "nothing to commit"
 */
export function commit(repositoryDir, message) {
    initRepo(repositoryDir);
    execSync("git add .", {
        cwd: repositoryDir,
        env: process.env,
        encoding: "utf-8",
        stdio: ["ignore", "ignore", "pipe"],
        maxBuffer: MAXBUFFER,
    });
    try {
        // The message is sent on stdin (`-F -`) instead of being interpolated
        // into the shell command, so quotes, `$` or backticks in the message
        // can neither break the command nor be interpreted by the shell.
        execSync("git commit --quiet -F -", {
            cwd: repositoryDir,
            env: process.env,
            encoding: "utf-8",
            input: message,
            stdio: ["pipe", "pipe", "pipe"],
        });
        return true;
    }
    catch (childProcess) {
        const { stdout } = childProcess;
        // git exits with a non-zero status when the index is clean; that case
        // is recognized from its English or French message on stdout.
        const nothingToCommit = typeof stdout === "string" && /nothing to commit|rien à valider/.test(stdout);
        if (nothingToCommit) {
            return false;
        }
        console.error(childProcess.output);
        throw childProcess;
    }
}
|
|
43
|
+
/**
 * Commit every change in `repositoryDir`, then push `master` to each remote.
 *
 * A failing push is logged and does not prevent the remaining remotes from
 * being tried.
 *
 * @param repositoryDir The directory of the git repository
 * @param message The commit message
 * @param remotes Names of the git remotes to push to (none when omitted)
 * @returns `0` when the commit was created and every push succeeded, `10` when
 *   there was nothing to commit, otherwise the exit status of the last push
 *   that failed
 */
export function commitAndPush(repositoryDir, message, remotes) {
    if (!commit(repositoryDir, message)) {
        // There was nothing to commit.
        return 10;
    }
    let exitCode = 0;
    for (const remote of remotes || []) {
        try {
            execSync(`git push ${remote} master`, {
                cwd: repositoryDir,
                env: process.env,
                encoding: "utf-8",
                stdio: ["ignore", "ignore", "pipe"],
            });
        }
        catch (childProcess) {
            // Don't stop when push fails.
            console.error(childProcess.output);
            // `status` is null when the process was killed by a signal or could
            // not be spawned; report a generic failure rather than returning
            // a non-numeric value that a caller could mistake for success.
            exitCode = childProcess.status ?? 1;
        }
    }
    return exitCode;
}
|
|
68
|
+
/**
 * Discard local state in `gitDir` and bring it up to date.
 *
 * Runs `git reset --hard origin/master` followed by `git pull --rebase`.
 * An error thrown by either command propagates to the caller.
 *
 * @param gitDir The directory of the git repository
 * @returns Always `true` when both commands completed
 */
export function resetAndPull(gitDir) {
    const commands = ["git reset --hard origin/master", "git pull --rebase"];
    for (const command of commands) {
        // A fresh options object is built for each invocation.
        execSync(command, {
            cwd: gitDir,
            env: process.env,
            encoding: "utf-8",
            stdio: ["ignore", "ignore", "pipe"],
        });
    }
    return true;
}
|
|
83
|
+
/**
 * Clone the repository `<gitGroupUrl>/<gitName>.git` from inside `workingDir`.
 *
 * Does nothing when `gitGroupUrl` is `undefined`.
 *
 * @param gitGroupUrl URL of the group or organization hosting the repository
 * @param gitName Name of the repository, without the `.git` suffix
 * @param workingDir Directory in which `git clone` is executed
 */
export function clone(gitGroupUrl, gitName, workingDir) {
    if (gitGroupUrl === undefined) {
        return;
    }
    const repositoryUrl = `${gitGroupUrl}/${gitName}.git`;
    const cloneOptions = {
        cwd: workingDir,
        env: process.env,
        encoding: "utf-8",
        stdio: ["ignore", "ignore", "pipe"],
    };
    execSync(`git clone ${repositoryUrl}`, cloneOptions);
}
|
|
93
|
+
/**
 * Run `git <args>` in `repositoryDir` and return what it printed.
 *
 * @param repositoryDir The directory of the git repository
 * @param args Arguments appended verbatim after `git` on the command line
 * @param verbose When truthy, log the command and its output
 * @returns The trimmed standard output of the command
 * @throws The error raised by `execSync`, after its stdout and stderr have
 *   been written with `console.error`
 */
export function run(repositoryDir, args, verbose) {
    try {
        if (verbose) {
            console.log(`git -C ${repositoryDir} ${args}`);
        }
        const rawOutput = execSync(`git ${args}`, {
            cwd: repositoryDir,
            maxBuffer: MAXBUFFER,
        });
        const trimmedOutput = rawOutput.toString().trim();
        if (verbose) {
            console.log(trimmedOutput);
        }
        return trimmedOutput;
    }
    catch (childProcess) {
        console.error(`stdout: ${childProcess.stdout}`);
        console.error(`stderr: ${childProcess.stderr}`);
        throw childProcess;
    }
}
|
|
113
|
+
/**
 * Run `git <args>` in `repositoryDir` and report whether it succeeded.
 *
 * @param repositoryDir The directory of the git repository
 * @param args Arguments appended verbatim after `git` on the command line
 * @param verbose When truthy, log the command and its output
 * @returns `true` when the command completed without throwing, `false` when
 *   the thrown error carries a status that is not loosely equal to 0
 * @throws The caught error when its status is loosely equal to 0
 */
export function test(repositoryDir, args, verbose) {
    try {
        if (verbose) {
            console.log(`git -C ${repositoryDir} ${args}`);
        }
        const rawOutput = execSync(`git ${args}`, {
            cwd: repositoryDir,
            stdio: ["ignore", "pipe", "pipe"],
            maxBuffer: MAXBUFFER,
        });
        const trimmedOutput = rawOutput.toString().trim();
        if (verbose) {
            console.log(trimmedOutput);
        }
        return true;
    }
    catch (childProcess) {
        // Loose comparison kept on purpose: a null or undefined status
        // is also reported as `false`.
        if (childProcess.status == 0) {
            throw childProcess;
        }
        return false;
    }
}
|
|
134
|
+
/**
 * Parse the output of `git diff --name-status` into a Map.
 *
 * Each non-empty line is `<status>\t<path>`, or `<status><score>\t<old>\t<new>`
 * for renames and copies. Only the first letter of the status is kept, and the
 * last tab-separated field is used as the path so that renamed or copied files
 * are reported under the path that exists after the change.
 *
 * @param output Raw text printed by `git diff --name-status`
 * @returns A Map of file paths to their one-letter git status
 */
function parseNameStatusOutput(output) {
    const changedFiles = new Map();
    for (const line of output.split("\n")) {
        if (line.trim().length === 0)
            continue;
        const parts = line.split("\t");
        // Lines without a tab carry no path and are ignored.
        if (parts.length < 2)
            continue;
        const status = parts[0].charAt(0);
        const filePath = parts[parts.length - 1];
        changedFiles.set(filePath, status);
    }
    return changedFiles;
}
/**
 * Get the list of files that have changed since a specific commit in a git repository.
 *
 * Renamed and copied files are listed under their new path.
 *
 * @param repositoryDir The directory of the git repository
 * @param sinceCommit The commit hash to compare against (e.g., "HEAD~1", "abc123", etc.)
 * @param options Options for filtering
 * @param options.diffFilter Git diff-filter string (default: "AMR").
 *   A=Added, M=Modified, D=Deleted, R=Renamed, C=Copied, T=Type changed, U=Unmerged
 * @returns A Map of file paths to their git status; an empty Map when the git
 *   command fails (the error is logged, not rethrown)
 */
export function getChangedFilesSinceCommit(repositoryDir, sinceCommit, options = {}) {
    const { diffFilter } = options;
    try {
        // Default to AMR (excludes deleted files to prevent loading errors)
        const filter = diffFilter ?? "AMR";
        const output = run(repositoryDir, `diff --name-status --diff-filter=${filter} ${sinceCommit}`, false);
        return parseNameStatusOutput(output);
    }
    catch (error) {
        console.error(`Error getting changed files since commit ${sinceCommit}:`, error);
        return new Map();
    }
}
|
package/lib/index.d.ts
CHANGED
|
@@ -3,7 +3,7 @@ export type { DossierLegislatifResult } from "./model/dosleg";
|
|
|
3
3
|
export type { ScrutinResult } from "./model/scrutins";
|
|
4
4
|
export type { QuestionResult } from "./model/questions";
|
|
5
5
|
export type { CirconscriptionResult, OrganismeResult, SenateurResult } from "./model/sens";
|
|
6
|
-
export type { AgendaEvent,
|
|
6
|
+
export type { AgendaEvent, Reunion, ReunionOdjPoint } from "./types/agenda";
|
|
7
7
|
export type { Ses, Sub, TxtAmeli } from "./types/ameli";
|
|
8
8
|
export type { CompteRendu } from "./types/compte_rendu";
|
|
9
9
|
export type { Debat, LecAssDeb } from "./types/debats";
|
package/lib/loaders.d.ts
CHANGED
|
@@ -4,7 +4,7 @@ import { DossierLegislatifResult } from "./model/dosleg";
|
|
|
4
4
|
import { QuestionResult } from "./model/questions";
|
|
5
5
|
import { ScrutinResult } from "./model/scrutins";
|
|
6
6
|
import { CirconscriptionResult, OrganismeResult, SenateurResult } from "./model/sens";
|
|
7
|
-
import {
|
|
7
|
+
import { Reunion } from "./types/agenda";
|
|
8
8
|
import { FlatTexte } from "./types/texte";
|
|
9
9
|
import { CompteRendu } from "./types/compte_rendu";
|
|
10
10
|
export { EnabledDatasets } from "./datasets";
|
|
@@ -49,6 +49,7 @@ export interface DossierLegislatifDocumentResult {
|
|
|
49
49
|
type_lecture: string;
|
|
50
50
|
libelle_lecture: string;
|
|
51
51
|
libelle_organisme: string | null;
|
|
52
|
+
code_organisme: string | null;
|
|
52
53
|
numero: number | null;
|
|
53
54
|
id: string | null;
|
|
54
55
|
url: string;
|
|
@@ -86,7 +87,7 @@ export declare function loadSenatTexteContent(dataDir: string, textePathFromData
|
|
|
86
87
|
export declare function loadSenatCompteRenduContent(dataDir: string, session: number, debatId: string | number): {
|
|
87
88
|
item: CompteRendu | null;
|
|
88
89
|
};
|
|
89
|
-
export declare function iterLoadSenatAgendas(dataDir: string, session: number | undefined): Generator<IterItem<
|
|
90
|
+
export declare function iterLoadSenatAgendas(dataDir: string, session: number | undefined): Generator<IterItem<Reunion>>;
|
|
90
91
|
export declare function iterLoadSenatCirconscriptions(dataDir: string, options?: {}): Generator<IterItem<CirconscriptionResult>>;
|
|
91
92
|
export declare function iterLoadSenatOrganismes(dataDir: string, options?: {}): Generator<IterItem<OrganismeResult>>;
|
|
92
93
|
export declare function iterLoadSenatSenateurs(dataDir: string, options?: {}): Generator<IterItem<SenateurResult>>;
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import * as cheerio from "cheerio";
|
|
2
2
|
import { CompteRendu } from "../types/compte_rendu";
|
|
3
|
-
import {
|
|
3
|
+
import { Reunion } from "../types/agenda";
|
|
4
4
|
export declare function getRemainingTextAfterSpeakerHeader($: cheerio.CheerioAPI, $p: cheerio.Cheerio<any>): string;
|
|
5
5
|
export type DaySection = {
|
|
6
6
|
title: string;
|
|
@@ -14,5 +14,5 @@ export declare function parseCommissionCRSectionFromDom($: cheerio.CheerioAPI, h
|
|
|
14
14
|
hourShort: string | null;
|
|
15
15
|
organe?: string | null;
|
|
16
16
|
section: DaySection;
|
|
17
|
-
matched?:
|
|
17
|
+
matched?: Reunion;
|
|
18
18
|
}): CompteRendu | null;
|
package/lib/model/commission.js
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import * as cheerio from "cheerio";
|
|
2
2
|
import path from "path";
|
|
3
|
-
import {
|
|
4
|
-
import {
|
|
5
|
-
import { frDateToISO, hourShortToStartTime } from "../utils/
|
|
3
|
+
import { makeReunionUid } from "../utils/reunion_parsing";
|
|
4
|
+
import { norm } from "../utils/string_cleaning";
|
|
5
|
+
import { frDateToISO, hourShortToStartTime } from "../utils/date";
|
|
6
|
+
import { toCRDate } from "./util";
|
|
6
7
|
const PARA_h3_SEL = "p.sh_justify, p.sh_center, p.sh_marge, p[align], li, h3";
|
|
7
8
|
function findDayRoot($, targetISO) {
|
|
8
9
|
let $root = $();
|
|
@@ -208,7 +209,7 @@ export function extractDayH3Sections($, dateISO) {
|
|
|
208
209
|
export function parseCommissionCRSectionFromDom($, htmlFilePath, opts) {
|
|
209
210
|
try {
|
|
210
211
|
const { dateISO, hourShort, organe, section, matched } = opts;
|
|
211
|
-
const seanceRef = matched?.uid ??
|
|
212
|
+
const seanceRef = matched?.uid ?? makeReunionUid(dateISO, "COM", matched?.events[0].id ?? hourShort ?? "", organe ?? undefined);
|
|
212
213
|
const uid = seanceRef.replace(/^RU/, "CRC");
|
|
213
214
|
const dateSeance = toCRDate(dateISO, matched?.startTime ?? hourShortToStartTime(hourShort));
|
|
214
215
|
const $dayRoot = findDayRoot($, dateISO);
|
package/lib/model/seance.d.ts
CHANGED
|
@@ -1,9 +1,3 @@
|
|
|
1
|
-
import { CompteRendu
|
|
2
|
-
|
|
3
|
-
export declare function parseCompteRenduSlotFromFile(xmlFilePath: string, wantedSlot: TimeSlot, firstSlotOfDay?: TimeSlot): Promise<CompteRendu | null>;
|
|
1
|
+
import { CompteRendu } from "../types/compte_rendu";
|
|
2
|
+
export declare function parseCompteRenduIntervalFromFile(xmlFilePath: string, startIndex: number, endIndex: number, agendaEventId: string): Promise<CompteRendu | null>;
|
|
4
3
|
export declare function sessionStartYearFromDate(d: Date): number;
|
|
5
|
-
export declare function parseYYYYMMDD(yyyymmdd: string): Date | null;
|
|
6
|
-
export declare function deriveTitreObjetFromSommaire(sommaire: Sommaire | undefined, slot: TimeSlot): {
|
|
7
|
-
titre: string;
|
|
8
|
-
objet: string;
|
|
9
|
-
};
|
package/lib/model/seance.js
CHANGED
|
@@ -1,33 +1,29 @@
|
|
|
1
1
|
import fs from "fs";
|
|
2
2
|
import * as cheerio from "cheerio";
|
|
3
|
-
import
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
export async function parseCompteRenduSlotFromFile(xmlFilePath, wantedSlot, firstSlotOfDay) {
|
|
3
|
+
import { toCRDate } from "./util";
|
|
4
|
+
import { makeReunionUid } from "../utils/reunion_parsing";
|
|
5
|
+
import { yyyymmddFromPath } from "../utils/date";
|
|
6
|
+
import { decodeHtmlEntities, dedupeSpeaker, fixApostrophes, norm } from "../utils/string_cleaning";
|
|
7
|
+
export async function parseCompteRenduIntervalFromFile(xmlFilePath, startIndex, endIndex, agendaEventId) {
|
|
9
8
|
try {
|
|
10
9
|
const raw = fs.readFileSync(xmlFilePath, "utf8");
|
|
11
10
|
const $ = cheerio.load(raw, { xml: false });
|
|
12
11
|
const metadonnees = extractMetadonnees($, xmlFilePath);
|
|
13
12
|
const order = $("body *").toArray();
|
|
14
13
|
const idx = new Map(order.map((el, i) => [el, i]));
|
|
15
|
-
const
|
|
16
|
-
const
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
14
|
+
const totalNodes = order.length;
|
|
15
|
+
const clampedStart = Math.max(0, Math.min(startIndex, totalNodes - 1));
|
|
16
|
+
const clampedEnd = Math.max(0, Math.min(endIndex, totalNodes - 1));
|
|
17
|
+
const intervals = [
|
|
18
|
+
{
|
|
19
|
+
start: clampedStart,
|
|
20
|
+
end: clampedEnd,
|
|
21
|
+
},
|
|
22
|
+
];
|
|
21
23
|
metadonnees.sommaire = extractSommaireForIntervals($, idx, intervals);
|
|
22
24
|
const points = [];
|
|
23
25
|
let ordre = 0;
|
|
24
26
|
const addPoint = (p) => points.push({ ...p, ordre_absolu_seance: String(++ordre) });
|
|
25
|
-
// Titles removes because they are just listed at the top of the file and not linked to any ancre
|
|
26
|
-
// $("cri\\:titreS1 p.titre_S1").each((_, el) => {
|
|
27
|
-
// if (!elementInAnyInterval(el, idx, intervals)) return
|
|
28
|
-
// const t = normalizeTitle(norm($(el).text() || ""))
|
|
29
|
-
// if (t) addPoint({ code_grammaire: "TITRE_TEXTE_DISCUSSION", texte: { _: t }, code_style: "Titre" })
|
|
30
|
-
// })
|
|
31
27
|
// Interventions
|
|
32
28
|
$("div.intervenant").each((_, block) => {
|
|
33
29
|
if (!elementInAnyInterval(block, idx, intervals))
|
|
@@ -47,6 +43,8 @@ export async function parseCompteRenduSlotFromFile(xmlFilePath, wantedSlot, firs
|
|
|
47
43
|
].join(","))
|
|
48
44
|
.remove();
|
|
49
45
|
const firstP = $block.find("p").first();
|
|
46
|
+
if (!firstP || firstP.length === 0)
|
|
47
|
+
return;
|
|
50
48
|
const speakerLabelRaw = firstP.find(".orateur_nom").text() || firstP.find("a.lien_senfic").text() || "";
|
|
51
49
|
const speakerLabel = dedupeSpeaker(speakerLabelRaw);
|
|
52
50
|
const { mat, nom: nomCRI, qua: quaCRI } = readIntervenantMeta($block);
|
|
@@ -55,7 +53,8 @@ export async function parseCompteRenduSlotFromFile(xmlFilePath, wantedSlot, firs
|
|
|
55
53
|
const canonicalName = dedupeSpeaker(nomCRI || speakerLabel);
|
|
56
54
|
const role = roleForSpeaker(speakerLabel) || roleForSpeaker(qualite) || roleForSpeaker(quaCRI || "");
|
|
57
55
|
const speechHtml = sanitizeInterventionHtml($, $block);
|
|
58
|
-
|
|
56
|
+
const speechText = norm(cheerio.load(speechHtml).text() || "");
|
|
57
|
+
if (!speechText)
|
|
59
58
|
return;
|
|
60
59
|
addPoint({
|
|
61
60
|
code_grammaire: "PAROLE_GENERIQUE",
|
|
@@ -65,19 +64,25 @@ export async function parseCompteRenduSlotFromFile(xmlFilePath, wantedSlot, firs
|
|
|
65
64
|
});
|
|
66
65
|
});
|
|
67
66
|
const contenu = {
|
|
68
|
-
quantiemes: {
|
|
67
|
+
quantiemes: {
|
|
68
|
+
journee: metadonnees.dateSeance,
|
|
69
|
+
session: metadonnees.session,
|
|
70
|
+
},
|
|
69
71
|
point: points,
|
|
70
72
|
};
|
|
73
|
+
const yyyymmdd = yyyymmddFromPath(xmlFilePath);
|
|
74
|
+
const dateISO = `${yyyymmdd.slice(0, 4)}-${yyyymmdd.slice(4, 6)}-${yyyymmdd.slice(6, 8)}`;
|
|
75
|
+
const seanceRef = makeReunionUid(dateISO, "SP", agendaEventId, null);
|
|
71
76
|
return {
|
|
72
|
-
uid:
|
|
73
|
-
seanceRef
|
|
77
|
+
uid: `CRSSN${yyyymmdd}E${agendaEventId}`,
|
|
78
|
+
seanceRef,
|
|
74
79
|
sessionRef: metadonnees.session,
|
|
75
80
|
metadonnees,
|
|
76
81
|
contenu,
|
|
77
82
|
};
|
|
78
83
|
}
|
|
79
84
|
catch (e) {
|
|
80
|
-
console.error(`[CRI]
|
|
85
|
+
console.error(`[CRI] parseInterval error file=${xmlFilePath} interval=[${startIndex}..${endIndex}] event=${agendaEventId}:`, e);
|
|
81
86
|
return null;
|
|
82
87
|
}
|
|
83
88
|
}
|
|
@@ -87,96 +92,6 @@ export function sessionStartYearFromDate(d) {
|
|
|
87
92
|
const y = d.getFullYear();
|
|
88
93
|
return m >= 9 ? y : y - 1;
|
|
89
94
|
}
|
|
90
|
-
export function parseYYYYMMDD(yyyymmdd) {
|
|
91
|
-
if (!/^\d{8}$/.test(yyyymmdd))
|
|
92
|
-
return null;
|
|
93
|
-
const y = Number(yyyymmdd.slice(0, 4));
|
|
94
|
-
const m = Number(yyyymmdd.slice(4, 6)) - 1;
|
|
95
|
-
const d = Number(yyyymmdd.slice(6, 8));
|
|
96
|
-
const dt = new Date(y, m, d);
|
|
97
|
-
return Number.isFinite(dt.getTime()) ? dt : null;
|
|
98
|
-
}
|
|
99
|
-
export function deriveTitreObjetFromSommaire(sommaire, slot) {
|
|
100
|
-
const items = extractLevel1Items(sommaire);
|
|
101
|
-
const meaningful = items.filter((it) => !isBoilerplate(it.label));
|
|
102
|
-
if (meaningful.length === 0) {
|
|
103
|
-
return {
|
|
104
|
-
titre: `Séance publique ${slotLabel(slot)}`,
|
|
105
|
-
objet: "",
|
|
106
|
-
};
|
|
107
|
-
}
|
|
108
|
-
const titre = meaningful[0].label;
|
|
109
|
-
const objet = meaningful
|
|
110
|
-
.slice(0, 3)
|
|
111
|
-
.map((it) => it.label)
|
|
112
|
-
.join(" ; ");
|
|
113
|
-
return { titre, objet };
|
|
114
|
-
}
|
|
115
|
-
function slotLabel(slot) {
|
|
116
|
-
switch (slot) {
|
|
117
|
-
case "MATIN":
|
|
118
|
-
return "du matin";
|
|
119
|
-
case "APRES-MIDI":
|
|
120
|
-
return "de l’après-midi";
|
|
121
|
-
case "SOIR":
|
|
122
|
-
return "du soir";
|
|
123
|
-
default:
|
|
124
|
-
return "";
|
|
125
|
-
}
|
|
126
|
-
}
|
|
127
|
-
const BOILERPLATE_PATTERNS = [
|
|
128
|
-
/proc(?:è|e)s-?verbal/i,
|
|
129
|
-
/hommages?/i,
|
|
130
|
-
/désignation des vice-?président/i,
|
|
131
|
-
/candidatures? aux?/i,
|
|
132
|
-
/ordre du jour/i,
|
|
133
|
-
/rappels? au règlement/i,
|
|
134
|
-
/communications?/i,
|
|
135
|
-
/dépôts?/i,
|
|
136
|
-
/proclamation/i,
|
|
137
|
-
/présidence de/i,
|
|
138
|
-
/questions? diverses?/i,
|
|
139
|
-
/ouverture de la séance/i,
|
|
140
|
-
/clo(?:t|̂)ure de la séance/i,
|
|
141
|
-
];
|
|
142
|
-
const isBoilerplate = (label) => !label?.trim() || BOILERPLATE_PATTERNS.some((rx) => rx.test(label));
|
|
143
|
-
function extractLevel1Items(sommaire) {
|
|
144
|
-
const level1 = asArray(sommaire?.sommaire1);
|
|
145
|
-
return level1
|
|
146
|
-
.map((el) => ({
|
|
147
|
-
numero: toInt(el?.valeur_pts_odj),
|
|
148
|
-
label: String(el?.titreStruct?.intitule ?? "").trim(),
|
|
149
|
-
}))
|
|
150
|
-
.filter((it) => !!it.label)
|
|
151
|
-
.sort((a, b) => a.numero - b.numero);
|
|
152
|
-
}
|
|
153
|
-
function stripTrailingPunct(s) {
|
|
154
|
-
return s.replace(/\s*([:,.;])\s*$/u, "").trim();
|
|
155
|
-
}
|
|
156
|
-
function dedupeSpeaker(raw) {
|
|
157
|
-
let s = norm(raw);
|
|
158
|
-
s = stripTrailingPunct(s);
|
|
159
|
-
const dupPatterns = [/^(.+?)\s*[.]\s*\1$/u, /^(.+?)\s*,\s*\1,?$/u, /^(.+?)\s+\1$/u];
|
|
160
|
-
for (const re of dupPatterns) {
|
|
161
|
-
const m = s.match(re);
|
|
162
|
-
if (m) {
|
|
163
|
-
s = m[1];
|
|
164
|
-
break;
|
|
165
|
-
}
|
|
166
|
-
}
|
|
167
|
-
return s.replace(/\.\s*$/, "");
|
|
168
|
-
}
|
|
169
|
-
function fixApostrophes(s) {
|
|
170
|
-
let out = s;
|
|
171
|
-
out = out.replace(/\s*’\s*/g, "’");
|
|
172
|
-
out = out.replace(/\b([dljctmsn])\s*’/gi, (_, m) => m + "’");
|
|
173
|
-
out = out.replace(/’\s+([A-Za-zÀ-ÖØ-öø-ÿ])/g, "’$1");
|
|
174
|
-
out = out.replace(/\s+([,;:.!?])/g, "$1");
|
|
175
|
-
return out;
|
|
176
|
-
}
|
|
177
|
-
function normalizeTitle(text) {
|
|
178
|
-
return text.replace(/^PR[ÉE]SIDENCE DE\b/i, "Présidence de ");
|
|
179
|
-
}
|
|
180
95
|
function roleForSpeaker(labelOrQualite) {
|
|
181
96
|
const s = (labelOrQualite || "").toLowerCase();
|
|
182
97
|
if (/^(m\.|mme)?\s*(le|la)\s+pr[ée]sident(e)?\b/.test(s) || /\bpr[ée]sident[e]?\s+de\s+séance\b/.test(s))
|
package/lib/model/util.d.ts
CHANGED
|
@@ -6,8 +6,4 @@ export declare function removeSubstring(expr: Expression<string | null | undefin
|
|
|
6
6
|
export declare function replace(expr: Expression<string | null | undefined>, pattern: Expression<string>, replacement: Expression<string>): import("kysely").RawBuilder<string>;
|
|
7
7
|
export declare function rtrim(expr: Expression<string | null | undefined>): import("kysely").RawBuilder<string>;
|
|
8
8
|
export declare function toDateString(expr: Expression<Date | null | undefined>, format?: Expression<string>): import("kysely").RawBuilder<string>;
|
|
9
|
-
export declare function norm(s?: string | null): string;
|
|
10
9
|
export declare function toCRDate(dateISO: string, startTime?: string | null): string;
|
|
11
|
-
export declare function normalizeTitle(t: string): string;
|
|
12
|
-
export declare function jaccardTokenSim(a: string, b: string): number;
|
|
13
|
-
export declare function decodeHtmlEntities(s?: string | null): string;
|
package/lib/model/util.js
CHANGED
|
@@ -21,12 +21,6 @@ export function rtrim(expr) {
|
|
|
21
21
|
export function toDateString(expr, format = sql.val(STANDARD_DATE_FORMAT)) {
|
|
22
22
|
return sql `to_char(${expr}, ${format})`;
|
|
23
23
|
}
|
|
24
|
-
export function norm(s) {
|
|
25
|
-
return (s || "")
|
|
26
|
-
.replace(/\u00A0/g, " ")
|
|
27
|
-
.replace(/\s+/g, " ")
|
|
28
|
-
.trim();
|
|
29
|
-
}
|
|
30
24
|
export function toCRDate(dateISO, startTime) {
|
|
31
25
|
const yyyymmdd = dateISO.replace(/-/g, ""); // "20250716"
|
|
32
26
|
let hh = "00", mm = "00", ss = "00", SSS = "000";
|
|
@@ -42,35 +36,3 @@ export function toCRDate(dateISO, startTime) {
|
|
|
42
36
|
}
|
|
43
37
|
return `${yyyymmdd}${hh}${mm}${ss}${SSS}`;
|
|
44
38
|
}
|
|
45
|
-
export function normalizeTitle(t) {
|
|
46
|
-
return (t || "")
|
|
47
|
-
.toLowerCase()
|
|
48
|
-
.normalize("NFD")
|
|
49
|
-
.replace(/\p{Diacritic}/gu, "")
|
|
50
|
-
.replace(/[^a-z0-9\s]/g, " ")
|
|
51
|
-
.replace(/\s+/g, " ")
|
|
52
|
-
.trim();
|
|
53
|
-
}
|
|
54
|
-
export function jaccardTokenSim(a, b) {
|
|
55
|
-
const A = new Set(normalizeTitle(a).split(" ").filter(Boolean));
|
|
56
|
-
const B = new Set(normalizeTitle(b).split(" ").filter(Boolean));
|
|
57
|
-
if (A.size === 0 || B.size === 0)
|
|
58
|
-
return 0;
|
|
59
|
-
let inter = 0;
|
|
60
|
-
for (const x of A)
|
|
61
|
-
if (B.has(x))
|
|
62
|
-
inter++;
|
|
63
|
-
return inter / (A.size + B.size - inter);
|
|
64
|
-
}
|
|
65
|
-
export function decodeHtmlEntities(s) {
|
|
66
|
-
if (!s)
|
|
67
|
-
return "";
|
|
68
|
-
return s
|
|
69
|
-
.replace(/&#x([0-9a-fA-F]+);/g, (_, h) => String.fromCodePoint(parseInt(h, 16)))
|
|
70
|
-
.replace(/&#(\d+);/g, (_, d) => String.fromCodePoint(parseInt(d, 10)))
|
|
71
|
-
.replace(/&/g, "&")
|
|
72
|
-
.replace(/</g, "<")
|
|
73
|
-
.replace(/>/g, ">")
|
|
74
|
-
.replace(/"/g, '"')
|
|
75
|
-
.replace(/'/g, "'");
|
|
76
|
-
}
|