@tricoteuses/senat 2.18.12 → 2.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +22 -22
- package/README.md +123 -123
- package/lib/model/ameli.js +1 -2
- package/lib/scripts/retrieve_documents.js +1 -2
- package/lib/scripts/retrieve_open_data.js +76 -75
- package/lib/scripts/retrieve_videos.js +3 -3
- package/package.json +101 -101
package/LICENSE.md
CHANGED
|
@@ -1,22 +1,22 @@
|
|
|
1
|
-
# Tricoteuses-Senat
|
|
2
|
-
|
|
3
|
-
## _Handle French Sénat's open data_
|
|
4
|
-
|
|
5
|
-
By: Emmanuel Raviart <mailto:emmanuel@raviart.com>
|
|
6
|
-
|
|
7
|
-
Copyright (C) 2019, 2020, 2021 Emmanuel Raviart
|
|
8
|
-
|
|
9
|
-
https://git.tricoteuses.fr/logiciels/tricoteuses-senat
|
|
10
|
-
|
|
11
|
-
> Tricoteuses-Senat is free software; you can redistribute it and/or modify
|
|
12
|
-
> it under the terms of the GNU Affero General Public License as
|
|
13
|
-
> published by the Free Software Foundation, either version 3 of the
|
|
14
|
-
> License, or (at your option) any later version.
|
|
15
|
-
>
|
|
16
|
-
> Tricoteuses-Senat is distributed in the hope that it will be useful,
|
|
17
|
-
> but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
18
|
-
> MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
19
|
-
> GNU Affero General Public License for more details.
|
|
20
|
-
>
|
|
21
|
-
> You should have received a copy of the GNU Affero General Public License
|
|
22
|
-
> along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
1
|
+
# Tricoteuses-Senat
|
|
2
|
+
|
|
3
|
+
## _Handle French Sénat's open data_
|
|
4
|
+
|
|
5
|
+
By: Emmanuel Raviart <mailto:emmanuel@raviart.com>
|
|
6
|
+
|
|
7
|
+
Copyright (C) 2019, 2020, 2021 Emmanuel Raviart
|
|
8
|
+
|
|
9
|
+
https://git.tricoteuses.fr/logiciels/tricoteuses-senat
|
|
10
|
+
|
|
11
|
+
> Tricoteuses-Senat is free software; you can redistribute it and/or modify
|
|
12
|
+
> it under the terms of the GNU Affero General Public License as
|
|
13
|
+
> published by the Free Software Foundation, either version 3 of the
|
|
14
|
+
> License, or (at your option) any later version.
|
|
15
|
+
>
|
|
16
|
+
> Tricoteuses-Senat is distributed in the hope that it will be useful,
|
|
17
|
+
> but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
18
|
+
> MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
19
|
+
> GNU Affero General Public License for more details.
|
|
20
|
+
>
|
|
21
|
+
> You should have received a copy of the GNU Affero General Public License
|
|
22
|
+
> along with this program. If not, see <http://www.gnu.org/licenses/>.
|
package/README.md
CHANGED
|
@@ -1,123 +1,123 @@
|
|
|
1
|
-
# Tricoteuses-Senat
|
|
2
|
-
|
|
3
|
-
## _Retrieve, clean up & handle French Sénat's open data_
|
|
4
|
-
|
|
5
|
-
## Requirements
|
|
6
|
-
|
|
7
|
-
- Node >= 22
|
|
8
|
-
|
|
9
|
-
## Installation
|
|
10
|
-
|
|
11
|
-
```bash
|
|
12
|
-
git clone https://git.tricoteuses.fr/logiciels/tricoteuses-senat
|
|
13
|
-
cd tricoteuses-senat/
|
|
14
|
-
```
|
|
15
|
-
|
|
16
|
-
Create a `.env` file to set PostgreSQL database informations and other configuration variables (you can use `example.env` as a template). Then
|
|
17
|
-
|
|
18
|
-
```bash
|
|
19
|
-
npm install
|
|
20
|
-
```
|
|
21
|
-
|
|
22
|
-
### Database creation (not needed if downloading with Docker image)
|
|
23
|
-
|
|
24
|
-
#### Using Docker
|
|
25
|
-
|
|
26
|
-
```bash
|
|
27
|
-
docker run --name local-postgres -d -p 5432:5432 -e POSTGRES_PASSWORD=$YOUR_CUSTOM_DB_PASSWORD postgres
|
|
28
|
-
# Default Postgres user is postgres
|
|
29
|
-
# But scripts require an "opendata" role
|
|
30
|
-
docker exec -it local-postgres psql -U postgres -c "CREATE ROLE opendata;"
|
|
31
|
-
```
|
|
32
|
-
|
|
33
|
-
## Download data
|
|
34
|
-
|
|
35
|
-
Create a folder where the data will be downloaded and run the following command to download the data and convert it into JSON files.
|
|
36
|
-
|
|
37
|
-
```bash
|
|
38
|
-
mkdir ../senat-data/
|
|
39
|
-
|
|
40
|
-
# Available options for optional `categories` parameter : All, Ameli, Debats, DosLeg, Questions, Sens
|
|
41
|
-
npm run data:download ../senat-data -- [--categories All]
|
|
42
|
-
```
|
|
43
|
-
|
|
44
|
-
Data from other sources is also available :
|
|
45
|
-
|
|
46
|
-
```bash
|
|
47
|
-
# Retrieval of textes and rapports from Sénat's website
|
|
48
|
-
# Available options for optional `formats` parameter : xml, html, pdf
|
|
49
|
-
# Available options for optional `types` parameter : textes, rapports
|
|
50
|
-
npm run data:retrieve_documents ../senat-data -- --fromSession 2022 [--formats xml pdf] [--types textes]
|
|
51
|
-
|
|
52
|
-
# Retrieval & parsing (textes in xml format only for now)
|
|
53
|
-
npm run data:retrieve_documents ../senat-data -- --fromSession 2022 --parseDocuments
|
|
54
|
-
|
|
55
|
-
# Parsing only
|
|
56
|
-
npm run data:parse_textes_lois ../senat-data
|
|
57
|
-
|
|
58
|
-
# Retrieval (& parsing) of agenda from Sénat's website
|
|
59
|
-
npm run data:retrieve_agenda ../senat-data -- --fromSession 2022 [--parseAgenda]
|
|
60
|
-
|
|
61
|
-
# Retrieval (& parsing) of comptes-rendus de séance from Sénat's data
|
|
62
|
-
npm run data:retrieve_cr_seance ../senat-data -- [--parseDebats]
|
|
63
|
-
|
|
64
|
-
# Retrieval (& parsing) of comptes-rendus de commissions from Sénat's website
|
|
65
|
-
npm run data:retrieve_cr_commission ../senat-data -- [--parseDebats]
|
|
66
|
-
|
|
67
|
-
# Retrieval of sénateurs' pictures from Sénat's website
|
|
68
|
-
npm run data:retrieve_senateurs_photos ../senat-data
|
|
69
|
-
```
|
|
70
|
-
|
|
71
|
-
## Data download using Docker
|
|
72
|
-
|
|
73
|
-
A Docker image that downloads and converts the data all at once is available. Build it locally or run it from the container registry.
|
|
74
|
-
Use the environment variables `FROM_SESSION` and `CATEGORIES` if needed.
|
|
75
|
-
|
|
76
|
-
```bash
|
|
77
|
-
docker run --pull always --name tricoteuses-senat -v ../senat-data:/app/senat-data -d git.tricoteuses.fr/logiciels/tricoteuses-senat:latest
|
|
78
|
-
```
|
|
79
|
-
|
|
80
|
-
Use the environment variable `CATEGORIES` and `FROM_SESSION` if needed.
|
|
81
|
-
|
|
82
|
-
## Using the data
|
|
83
|
-
|
|
84
|
-
Once the data is downloaded, you can use loaders to retrieve it.
|
|
85
|
-
To use loaders in your project, you can install the _@tricoteuses/senat_ package, and import the iterator functions that you need.
|
|
86
|
-
|
|
87
|
-
```bash
|
|
88
|
-
npm install @tricoteuses/senat
|
|
89
|
-
```
|
|
90
|
-
|
|
91
|
-
```js
|
|
92
|
-
import { iterLoadSenatQuestions } from "@tricoteuses/senat/loaders"
|
|
93
|
-
|
|
94
|
-
// Pass data directory and legislature as arguments
|
|
95
|
-
for (const { item: question } of iterLoadSenatQuestions("../senat-data", 17)) {
|
|
96
|
-
console.log(question.id)
|
|
97
|
-
}
|
|
98
|
-
```
|
|
99
|
-
|
|
100
|
-
## Generation of raw types from SQL schema (for contributors only)
|
|
101
|
-
|
|
102
|
-
```bash
|
|
103
|
-
npm run data:generate_schemas ../senat-data
|
|
104
|
-
```
|
|
105
|
-
|
|
106
|
-
## Publishing
|
|
107
|
-
|
|
108
|
-
To publish a new version of this package onto npm, bump the package version and publish.
|
|
109
|
-
|
|
110
|
-
```bash
|
|
111
|
-
# Increment version and create a new Git tag automatically
|
|
112
|
-
npm version patch # +0.0.1 → small fixes
|
|
113
|
-
npm version minor # +0.1.0 → new features
|
|
114
|
-
npm version major # +1.0.0 → breaking changes
|
|
115
|
-
npx tsc
|
|
116
|
-
npm publish
|
|
117
|
-
```
|
|
118
|
-
|
|
119
|
-
The Docker image will be automatically built during a CI Workflow if you push the tag to the remote repository.
|
|
120
|
-
|
|
121
|
-
```bash
|
|
122
|
-
git push --tags
|
|
123
|
-
```
|
|
1
|
+
# Tricoteuses-Senat
|
|
2
|
+
|
|
3
|
+
## _Retrieve, clean up & handle French Sénat's open data_
|
|
4
|
+
|
|
5
|
+
## Requirements
|
|
6
|
+
|
|
7
|
+
- Node >= 22
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
git clone https://git.tricoteuses.fr/logiciels/tricoteuses-senat
|
|
13
|
+
cd tricoteuses-senat/
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
Create a `.env` file to set PostgreSQL database informations and other configuration variables (you can use `example.env` as a template). Then
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
npm install
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
### Database creation (not needed if downloading with Docker image)
|
|
23
|
+
|
|
24
|
+
#### Using Docker
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
docker run --name local-postgres -d -p 5432:5432 -e POSTGRES_PASSWORD=$YOUR_CUSTOM_DB_PASSWORD postgres
|
|
28
|
+
# Default Postgres user is postgres
|
|
29
|
+
# But scripts require an "opendata" role
|
|
30
|
+
docker exec -it local-postgres psql -U postgres -c "CREATE ROLE opendata;"
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Download data
|
|
34
|
+
|
|
35
|
+
Create a folder where the data will be downloaded and run the following command to download the data and convert it into JSON files.
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
mkdir ../senat-data/
|
|
39
|
+
|
|
40
|
+
# Available options for optional `categories` parameter : All, Ameli, Debats, DosLeg, Questions, Sens
|
|
41
|
+
npm run data:download ../senat-data -- [--categories All]
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Data from other sources is also available :
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
# Retrieval of textes and rapports from Sénat's website
|
|
48
|
+
# Available options for optional `formats` parameter : xml, html, pdf
|
|
49
|
+
# Available options for optional `types` parameter : textes, rapports
|
|
50
|
+
npm run data:retrieve_documents ../senat-data -- --fromSession 2022 [--formats xml pdf] [--types textes]
|
|
51
|
+
|
|
52
|
+
# Retrieval & parsing (textes in xml format only for now)
|
|
53
|
+
npm run data:retrieve_documents ../senat-data -- --fromSession 2022 --parseDocuments
|
|
54
|
+
|
|
55
|
+
# Parsing only
|
|
56
|
+
npm run data:parse_textes_lois ../senat-data
|
|
57
|
+
|
|
58
|
+
# Retrieval (& parsing) of agenda from Sénat's website
|
|
59
|
+
npm run data:retrieve_agenda ../senat-data -- --fromSession 2022 [--parseAgenda]
|
|
60
|
+
|
|
61
|
+
# Retrieval (& parsing) of comptes-rendus de séance from Sénat's data
|
|
62
|
+
npm run data:retrieve_cr_seance ../senat-data -- [--parseDebats]
|
|
63
|
+
|
|
64
|
+
# Retrieval (& parsing) of comptes-rendus de commissions from Sénat's website
|
|
65
|
+
npm run data:retrieve_cr_commission ../senat-data -- [--parseDebats]
|
|
66
|
+
|
|
67
|
+
# Retrieval of sénateurs' pictures from Sénat's website
|
|
68
|
+
npm run data:retrieve_senateurs_photos ../senat-data
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Data download using Docker
|
|
72
|
+
|
|
73
|
+
A Docker image that downloads and converts the data all at once is available. Build it locally or run it from the container registry.
|
|
74
|
+
Use the environment variables `FROM_SESSION` and `CATEGORIES` if needed.
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
docker run --pull always --name tricoteuses-senat -v ../senat-data:/app/senat-data -d git.tricoteuses.fr/logiciels/tricoteuses-senat:latest
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Use the environment variable `CATEGORIES` and `FROM_SESSION` if needed.
|
|
81
|
+
|
|
82
|
+
## Using the data
|
|
83
|
+
|
|
84
|
+
Once the data is downloaded, you can use loaders to retrieve it.
|
|
85
|
+
To use loaders in your project, you can install the _@tricoteuses/senat_ package, and import the iterator functions that you need.
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
npm install @tricoteuses/senat
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
```js
|
|
92
|
+
import { iterLoadSenatQuestions } from "@tricoteuses/senat/loaders"
|
|
93
|
+
|
|
94
|
+
// Pass data directory and legislature as arguments
|
|
95
|
+
for (const { item: question } of iterLoadSenatQuestions("../senat-data", 17)) {
|
|
96
|
+
console.log(question.id)
|
|
97
|
+
}
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Generation of raw types from SQL schema (for contributors only)
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
npm run data:generate_schemas ../senat-data
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## Publishing
|
|
107
|
+
|
|
108
|
+
To publish a new version of this package onto npm, bump the package version and publish.
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
# Increment version and create a new Git tag automatically
|
|
112
|
+
npm version patch # +0.0.1 → small fixes
|
|
113
|
+
npm version minor # +0.1.0 → new features
|
|
114
|
+
npm version major # +1.0.0 → breaking changes
|
|
115
|
+
npx tsc
|
|
116
|
+
npm publish
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
The Docker image will be automatically built during a CI Workflow if you push the tag to the remote repository.
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
git push --tags
|
|
123
|
+
```
|
package/lib/model/ameli.js
CHANGED
|
@@ -116,8 +116,7 @@ const findAllAmendementsQuery = dbSenat
|
|
|
116
116
|
"ameli.com_ameli.lil as au_nom_de_commission",
|
|
117
117
|
eb.case().when("ameli.cab.entid", "is not", null).then(true).else(false).end().as("auteur_est_gouvernement"),
|
|
118
118
|
auteurs(ref("ameli.amd.id")).as("auteurs"),
|
|
119
|
-
])
|
|
120
|
-
.distinctOn("ameli.amd.id");
|
|
119
|
+
]);
|
|
121
120
|
export function findAllAmendements(fromSession) {
|
|
122
121
|
if (fromSession !== undefined) {
|
|
123
122
|
return findAllAmendementsQuery.where("ameli.ses.ann", ">=", fromSession).stream();
|
package/lib/scripts/retrieve_documents.js
CHANGED
|
@@ -6,11 +6,10 @@ import path from "path";
|
|
|
6
6
|
import { DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatDossiersLegislatifsRapportUrls, iterLoadSenatDossiersLegislatifsTexteUrls, RAPPORT_FOLDER, TEXTE_FOLDER, } from "../loaders";
|
|
7
7
|
import { parseExposeDesMotifs, parseTexte, parseTexteFromFile } from "../model/texte";
|
|
8
8
|
import { getSessionsFromStart, UNDEFINED_SESSION } from "../types/sessions";
|
|
9
|
-
import { commonOptions, onlyRecentOption } from "./shared/cli_helpers";
|
|
9
|
+
import { commonOptions } from "./shared/cli_helpers";
|
|
10
10
|
import { ensureAndClearDir, fetchWithRetry, isOptionEmptyOrHasValue } from "./shared/util";
|
|
11
11
|
const optionsDefinitions = [
|
|
12
12
|
...commonOptions,
|
|
13
|
-
onlyRecentOption,
|
|
14
13
|
{
|
|
15
14
|
help: "parse and convert documents into JSON (textes only for now, requires format xml)",
|
|
16
15
|
name: "parseDocuments",
|
package/lib/scripts/retrieve_open_data.js
CHANGED
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
import assert from "assert";
|
|
2
|
-
import { execSync, spawn } from "child_process";
|
|
2
|
+
import { execSync } from "child_process";
|
|
3
3
|
import commandLineArgs from "command-line-args";
|
|
4
4
|
import fs from "fs-extra";
|
|
5
|
+
// import fetch from "node-fetch"
|
|
5
6
|
import path from "path";
|
|
6
7
|
import StreamZip from "node-stream-zip";
|
|
8
|
+
import readline from "readline";
|
|
7
9
|
import windows1252 from "windows-1252";
|
|
8
|
-
import { pipeline, Transform } from "stream";
|
|
10
|
+
import { pipeline } from "stream";
|
|
9
11
|
import { promisify } from "util";
|
|
10
12
|
import config from "../config";
|
|
11
13
|
import { getChosenDatasets, getEnabledDatasets } from "../datasets";
|
|
@@ -67,88 +69,60 @@ async function downloadFile(url, dest) {
|
|
|
67
69
|
}
|
|
68
70
|
/**
|
|
69
71
|
* Copy a dataset database to the main Senat database (overwriting its contents).
|
|
70
|
-
* Optimized to combine encoding repair and schema transformation in a single pass.
|
|
71
72
|
*/
|
|
72
73
|
async function copyToSenat(dataset, dataDir, options) {
|
|
73
74
|
if (!options["silent"]) {
|
|
74
75
|
console.log(`Copying ${dataset.database} to Senat database...`);
|
|
75
76
|
}
|
|
76
77
|
const sqlFilePath = path.join(dataDir, `${dataset.database}.sql`);
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
else if (!inString) {
|
|
86
|
-
parts[i] = parts[i].replace(/\bpublic\b(?=(\s*\.|\s*[,;]|\s|$))/g, schema);
|
|
87
|
-
}
|
|
88
|
-
}
|
|
89
|
-
return parts.join('');
|
|
90
|
-
}
|
|
91
|
-
// Spawn psql process
|
|
92
|
-
const psqlArgs = options["sudo"]
|
|
93
|
-
? ["-u", options["sudo"], "psql", "--quiet", "-d", "senat"]
|
|
94
|
-
: ["--quiet", "-d", "senat"];
|
|
95
|
-
const psql = spawn(options["sudo"] ? "sudo" : "psql", psqlArgs, {
|
|
96
|
-
stdio: ["pipe", "ignore", "pipe"],
|
|
97
|
-
env: process.env,
|
|
78
|
+
const schemaDumpFile = path.join(dataDir, `${dataset.database}_schema_dump.sql`);
|
|
79
|
+
// Write the header and then stream the rest of the SQL file
|
|
80
|
+
const schemaSqlWriter = fs.createWriteStream(schemaDumpFile, { encoding: "utf8" });
|
|
81
|
+
// Add CREATE SCHEMA statement at the top
|
|
82
|
+
schemaSqlWriter.write(`CREATE SCHEMA IF NOT EXISTS ${dataset.database};\n`);
|
|
83
|
+
const lineReader = readline.createInterface({
|
|
84
|
+
input: fs.createReadStream(sqlFilePath, { encoding: "utf8" }),
|
|
85
|
+
crlfDelay: Infinity,
|
|
98
86
|
});
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
let
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
let newLine = replacePublicOutsideStrings(line, dataset.database);
|
|
114
|
-
newLine = newLine.replace(/SET client_encoding = 'LATIN1';/i, "SET client_encoding = 'UTF8';");
|
|
115
|
-
processedData += newLine + '\n';
|
|
116
|
-
}
|
|
117
|
-
callback(null, processedData);
|
|
118
|
-
},
|
|
119
|
-
flush(callback) {
|
|
120
|
-
// Process any remaining data in buffer
|
|
121
|
-
if (buffer) {
|
|
122
|
-
let newLine = replacePublicOutsideStrings(buffer, dataset.database);
|
|
123
|
-
newLine = newLine.replace(/SET client_encoding = 'LATIN1';/i, "SET client_encoding = 'UTF8';");
|
|
124
|
-
callback(null, newLine);
|
|
125
|
-
}
|
|
126
|
-
else {
|
|
127
|
-
callback();
|
|
87
|
+
for await (const line of lineReader) {
|
|
88
|
+
let newLine = line;
|
|
89
|
+
// Replace 'public' schema outside single-quoted strings
|
|
90
|
+
function replacePublicOutsideStrings(line, schema) {
|
|
91
|
+
const parts = line.split(/(')/);
|
|
92
|
+
let inString = false;
|
|
93
|
+
for (let i = 0; i < parts.length; i++) {
|
|
94
|
+
if (parts[i] === "'") {
|
|
95
|
+
inString = !inString;
|
|
96
|
+
}
|
|
97
|
+
else if (!inString) {
|
|
98
|
+
// Only replace outside of strings, including before comma
|
|
99
|
+
parts[i] = parts[i].replace(/\bpublic\b(?=(\s*\.|\s*[,;]|\s|$))/g, schema);
|
|
100
|
+
}
|
|
128
101
|
}
|
|
102
|
+
return parts.join('');
|
|
129
103
|
}
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
}
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
104
|
+
newLine = replacePublicOutsideStrings(line, dataset.database);
|
|
105
|
+
// Replace SET client_encoding to UTF8
|
|
106
|
+
newLine = newLine.replace(/SET client_encoding = 'LATIN1';/i, "SET client_encoding = 'UTF8';");
|
|
107
|
+
schemaSqlWriter.write(newLine + "\n");
|
|
108
|
+
}
|
|
109
|
+
schemaSqlWriter.end();
|
|
110
|
+
await new Promise((resolve, reject) => {
|
|
111
|
+
schemaSqlWriter.on("finish", () => {
|
|
112
|
+
try {
|
|
113
|
+
execSync(`${options["sudo"] ? `sudo -u ${options["sudo"]} ` : ""}psql --quiet -d senat -f ${schemaDumpFile}`, {
|
|
114
|
+
env: process.env,
|
|
115
|
+
encoding: "utf-8",
|
|
116
|
+
stdio: ["ignore", "ignore", "pipe"],
|
|
117
|
+
});
|
|
144
118
|
}
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
}
|
|
149
|
-
reject(new Error(`psql exited with code ${code}`));
|
|
119
|
+
finally {
|
|
120
|
+
try { }
|
|
121
|
+
catch { }
|
|
150
122
|
}
|
|
123
|
+
resolve();
|
|
151
124
|
});
|
|
125
|
+
schemaSqlWriter.on("error", reject);
|
|
152
126
|
});
|
|
153
127
|
}
|
|
154
128
|
async function retrieveDataset(dataDir, dataset) {
|
|
@@ -202,9 +176,31 @@ async function retrieveDataset(dataDir, dataset) {
|
|
|
202
176
|
dataset.repairZip(dataset, dataDir);
|
|
203
177
|
}
|
|
204
178
|
}
|
|
205
|
-
|
|
206
|
-
|
|
179
|
+
if ((options["all"] || options["repairEncoding"]) && dataset.repairEncoding) {
|
|
180
|
+
if (!options["silent"]) {
|
|
181
|
+
console.log(`Repairing Windows CP1252 encoding in ${dataset.title}: ${sqlFilename}…`);
|
|
182
|
+
}
|
|
183
|
+
const repairedSqlFilePath = sqlFilePath + ".repaired";
|
|
184
|
+
const repairedSqlWriter = fs.createWriteStream(repairedSqlFilePath, {
|
|
185
|
+
encoding: "utf8",
|
|
186
|
+
});
|
|
187
|
+
// Read the file as latin1 (ISO-8859-1/CP1252) and write as UTF-8
|
|
188
|
+
const lineReader = readline.createInterface({
|
|
189
|
+
input: fs.createReadStream(sqlFilePath, { encoding: "latin1" }),
|
|
190
|
+
crlfDelay: Infinity,
|
|
191
|
+
});
|
|
192
|
+
for await (const line of lineReader) {
|
|
193
|
+
// Optionally repair Windows-1252 control characters
|
|
194
|
+
let repairedLine = line.replace(badWindows1252CharacterRegex, (match) => windows1252.decode(match, { mode: "fatal" }));
|
|
195
|
+
repairedSqlWriter.write(repairedLine + "\n");
|
|
196
|
+
}
|
|
197
|
+
repairedSqlWriter.end();
|
|
198
|
+
await fs.move(repairedSqlFilePath, sqlFilePath, { overwrite: true });
|
|
199
|
+
}
|
|
207
200
|
if (options["all"] || options["import"] || options["schema"]) {
|
|
201
|
+
if (!options["silent"]) {
|
|
202
|
+
console.log(`Importing ${dataset.title}: ${sqlFilename}…`);
|
|
203
|
+
}
|
|
208
204
|
await copyToSenat(dataset, dataDir, options);
|
|
209
205
|
// Create indexes programmatically after import
|
|
210
206
|
if (dataset.indexes) {
|
|
@@ -274,7 +270,12 @@ async function retrieveOpenData() {
|
|
|
274
270
|
process.env["PGUSER"] &&
|
|
275
271
|
process.env["PGPASSWORD"], "Missing database configuration: environment variables PGHOST, PGPORT, PGUSER and PGPASSWORD or TRICOTEUSES_SENAT_DB_* in .env file");
|
|
276
272
|
console.time("data extraction time");
|
|
277
|
-
execSync(`${options["sudo"] ? `sudo -u ${options["sudo"]} ` : ""}psql --quiet -c "
|
|
273
|
+
execSync(`${options["sudo"] ? `sudo -u ${options["sudo"]} ` : ""}psql --quiet -c "DROP DATABASE IF EXISTS senat"`, {
|
|
274
|
+
cwd: dataDir,
|
|
275
|
+
env: process.env,
|
|
276
|
+
encoding: "utf-8",
|
|
277
|
+
});
|
|
278
|
+
execSync(`${options["sudo"] ? `sudo -u ${options["sudo"]} ` : ""}psql --quiet -c "CREATE DATABASE senat WITH OWNER opendata"`, {
|
|
278
279
|
cwd: dataDir,
|
|
279
280
|
env: process.env,
|
|
280
281
|
encoding: "utf-8",
|
package/lib/scripts/retrieve_videos.js
CHANGED
|
@@ -390,9 +390,9 @@ async function processGroupedReunion(agenda, session, dataDir) {
|
|
|
390
390
|
if (accepted)
|
|
391
391
|
STATS.accepted++;
|
|
392
392
|
if (!options["silent"]) {
|
|
393
|
-
console.log(`[pick] ${agenda.uid} score=${best.score.toFixed(2)}
|
|
394
|
-
agenda title="${agenda.titre ?? ""}" agenda organe="${agenda.organe ?? ""}" agenda heure=${agenda.startTime}
|
|
395
|
-
best title="${best.vtitle ?? ""}" best organe="${best.vorgane ?? ""}"
|
|
393
|
+
console.log(`[pick] ${agenda.uid} score=${best.score.toFixed(2)}
|
|
394
|
+
agenda title="${agenda.titre ?? ""}" agenda organe="${agenda.organe ?? ""}" agenda heure=${agenda.startTime}
|
|
395
|
+
best title="${best.vtitle ?? ""}" best organe="${best.vorgane ?? ""}"
|
|
396
396
|
accepted=${accepted}`);
|
|
397
397
|
}
|
|
398
398
|
// ==== 3) Write metadata + NVS of the best candidate (always) ====
|
package/package.json
CHANGED
|
@@ -1,101 +1,101 @@
|
|
|
1
|
-
{
|
|
2
|
-
"name": "@tricoteuses/senat",
|
|
3
|
-
  "version": "2.18.12",
|
|
4
|
-
"description": "Handle French Sénat's open data",
|
|
5
|
-
"keywords": [
|
|
6
|
-
"France",
|
|
7
|
-
"open data",
|
|
8
|
-
"Parliament",
|
|
9
|
-
"Sénat"
|
|
10
|
-
],
|
|
11
|
-
"author": "Emmanuel Raviart <emmanuel@raviart.com>",
|
|
12
|
-
"bugs": {
|
|
13
|
-
"url": "https://git.tricoteuses.fr/logiciels/tricoteuses-senat/issues"
|
|
14
|
-
},
|
|
15
|
-
"homepage": "https://tricoteuses.fr/",
|
|
16
|
-
"license": "AGPL-3.0-or-later",
|
|
17
|
-
"repository": {
|
|
18
|
-
"type": "git",
|
|
19
|
-
"url": "https://git.tricoteuses.fr/logiciels/tricoteuses-senat.git"
|
|
20
|
-
},
|
|
21
|
-
"type": "module",
|
|
22
|
-
"engines": {
|
|
23
|
-
"node": ">=22"
|
|
24
|
-
},
|
|
25
|
-
"files": [
|
|
26
|
-
"lib"
|
|
27
|
-
],
|
|
28
|
-
"exports": {
|
|
29
|
-
".": {
|
|
30
|
-
"import": "./lib/index.js",
|
|
31
|
-
"types": "./lib/index.d.ts"
|
|
32
|
-
},
|
|
33
|
-
"./loaders": {
|
|
34
|
-
"import": "./lib/loaders.js",
|
|
35
|
-
"types": "./lib/loaders.d.ts"
|
|
36
|
-
},
|
|
37
|
-
"./package.json": "./package.json"
|
|
38
|
-
},
|
|
39
|
-
"publishConfig": {
|
|
40
|
-
"access": "public"
|
|
41
|
-
},
|
|
42
|
-
"scripts": {
|
|
43
|
-
"build": "tsc",
|
|
44
|
-
"build:types": "tsc --emitDeclarationOnly",
|
|
45
|
-
"data:convert_data": "tsx src/scripts/convert_data.ts",
|
|
46
|
-
"data:download": "tsx src/scripts/data-download.ts",
|
|
47
|
-
"data:generate_schemas": "tsx src/scripts/retrieve_open_data.ts --schema",
|
|
48
|
-
"data:retrieve_agenda": "cross-env TZ='Etc/UTC' tsx src/scripts/retrieve_agenda.ts",
|
|
49
|
-
"data:retrieve_cr_seance": "tsx src/scripts/retrieve_cr_seance.ts",
|
|
50
|
-
"data:retrieve_cr_commission": "tsx src/scripts/retrieve_cr_commission.ts",
|
|
51
|
-
"data:retrieve_documents": "tsx src/scripts/retrieve_documents.ts",
|
|
52
|
-
"data:retrieve_open_data": "tsx src/scripts/retrieve_open_data.ts --all",
|
|
53
|
-
"data:retrieve_senateurs_photos": "tsx src/scripts/retrieve_senateurs_photos.ts --fetch",
|
|
54
|
-
"data:retrieve_videos": "tsx src/scripts/retrieve_videos.ts",
|
|
55
|
-
"data:parse_textes_lois": "tsx src/scripts/parse_textes.ts",
|
|
56
|
-
"prepare": "npm run build",
|
|
57
|
-
"prepublishOnly": "npm run build",
|
|
58
|
-
"prettier": "prettier --write 'src/**/*.ts' 'tests/**/*.test.ts'",
|
|
59
|
-
"test:iter_load": "tsx src/scripts/test_iter_load.ts",
|
|
60
|
-
"type-check": "tsc --noEmit",
|
|
61
|
-
"type-check:watch": "npm run type-check -- --watch"
|
|
62
|
-
},
|
|
63
|
-
"dependencies": {
|
|
64
|
-
"@biryani/core": "^0.2.1",
|
|
65
|
-
"cheerio": "^1.1.2",
|
|
66
|
-
"command-line-args": "^5.1.1",
|
|
67
|
-
"dotenv": "^8.2.0",
|
|
68
|
-
"fs-extra": "^9.1.0",
|
|
69
|
-
"jsdom": "^26.0.0",
|
|
70
|
-
"kysely": "^0.27.4",
|
|
71
|
-
"luxon": "^3.7.2",
|
|
72
|
-
"node-stream-zip": "^1.8.2",
|
|
73
|
-
"pg": "^8.13.1",
|
|
74
|
-
"pg-cursor": "^2.12.1",
|
|
75
|
-
"p-limit": "^7.2.0",
|
|
76
|
-
"slug": "^11.0.0",
|
|
77
|
-
"tsx": "^4.20.6",
|
|
78
|
-
"windows-1252": "^1.0.0"
|
|
79
|
-
},
|
|
80
|
-
"devDependencies": {
|
|
81
|
-
"@typed-code/schemats": "^5.0.1",
|
|
82
|
-
"@types/cheerio": "^1.0.0",
|
|
83
|
-
"@types/command-line-args": "^5.0.0",
|
|
84
|
-
"@types/fs-extra": "^9.0.7",
|
|
85
|
-
"@types/jsdom": "^21.1.7",
|
|
86
|
-
"@types/luxon": "^3.7.1",
|
|
87
|
-
"@types/node": "^20.17.6",
|
|
88
|
-
"@types/pg": "^8.15.5",
|
|
89
|
-
"@types/pg-cursor": "^2.7.2",
|
|
90
|
-
"@types/slug": "^5.0.9",
|
|
91
|
-
"@typescript-eslint/eslint-plugin": "^8.46.0",
|
|
92
|
-
"@typescript-eslint/parser": "^8.46.0",
|
|
93
|
-
"cross-env": "^10.1.0",
|
|
94
|
-
"eslint": "^8.57.1",
|
|
95
|
-
"iconv-lite": "^0.7.0",
|
|
96
|
-
"kysely-codegen": "^0.19.0",
|
|
97
|
-
"prettier": "^3.5.3",
|
|
98
|
-
"tslib": "^2.1.0",
|
|
99
|
-
"typescript": "^5.9.3"
|
|
100
|
-
}
|
|
101
|
-
}
|
|
1
|
+
{
|
|
2
|
+
"name": "@tricoteuses/senat",
|
|
3
|
+
"version": "2.19.0",
|
|
4
|
+
"description": "Handle French Sénat's open data",
|
|
5
|
+
"keywords": [
|
|
6
|
+
"France",
|
|
7
|
+
"open data",
|
|
8
|
+
"Parliament",
|
|
9
|
+
"Sénat"
|
|
10
|
+
],
|
|
11
|
+
"author": "Emmanuel Raviart <emmanuel@raviart.com>",
|
|
12
|
+
"bugs": {
|
|
13
|
+
"url": "https://git.tricoteuses.fr/logiciels/tricoteuses-senat/issues"
|
|
14
|
+
},
|
|
15
|
+
"homepage": "https://tricoteuses.fr/",
|
|
16
|
+
"license": "AGPL-3.0-or-later",
|
|
17
|
+
"repository": {
|
|
18
|
+
"type": "git",
|
|
19
|
+
"url": "https://git.tricoteuses.fr/logiciels/tricoteuses-senat.git"
|
|
20
|
+
},
|
|
21
|
+
"type": "module",
|
|
22
|
+
"engines": {
|
|
23
|
+
"node": ">=22"
|
|
24
|
+
},
|
|
25
|
+
"files": [
|
|
26
|
+
"lib"
|
|
27
|
+
],
|
|
28
|
+
"exports": {
|
|
29
|
+
".": {
|
|
30
|
+
"import": "./lib/index.js",
|
|
31
|
+
"types": "./lib/index.d.ts"
|
|
32
|
+
},
|
|
33
|
+
"./loaders": {
|
|
34
|
+
"import": "./lib/loaders.js",
|
|
35
|
+
"types": "./lib/loaders.d.ts"
|
|
36
|
+
},
|
|
37
|
+
"./package.json": "./package.json"
|
|
38
|
+
},
|
|
39
|
+
"publishConfig": {
|
|
40
|
+
"access": "public"
|
|
41
|
+
},
|
|
42
|
+
"scripts": {
|
|
43
|
+
"build": "tsc",
|
|
44
|
+
"build:types": "tsc --emitDeclarationOnly",
|
|
45
|
+
"data:convert_data": "tsx src/scripts/convert_data.ts",
|
|
46
|
+
"data:download": "tsx src/scripts/data-download.ts",
|
|
47
|
+
"data:generate_schemas": "tsx src/scripts/retrieve_open_data.ts --schema",
|
|
48
|
+
"data:retrieve_agenda": "cross-env TZ='Etc/UTC' tsx src/scripts/retrieve_agenda.ts",
|
|
49
|
+
"data:retrieve_cr_seance": "tsx src/scripts/retrieve_cr_seance.ts",
|
|
50
|
+
"data:retrieve_cr_commission": "tsx src/scripts/retrieve_cr_commission.ts",
|
|
51
|
+
"data:retrieve_documents": "tsx src/scripts/retrieve_documents.ts",
|
|
52
|
+
"data:retrieve_open_data": "tsx src/scripts/retrieve_open_data.ts --all",
|
|
53
|
+
"data:retrieve_senateurs_photos": "tsx src/scripts/retrieve_senateurs_photos.ts --fetch",
|
|
54
|
+
"data:retrieve_videos": "tsx src/scripts/retrieve_videos.ts",
|
|
55
|
+
"data:parse_textes_lois": "tsx src/scripts/parse_textes.ts",
|
|
56
|
+
"prepare": "npm run build",
|
|
57
|
+
"prepublishOnly": "npm run build",
|
|
58
|
+
"prettier": "prettier --write 'src/**/*.ts' 'tests/**/*.test.ts'",
|
|
59
|
+
"test:iter_load": "tsx src/scripts/test_iter_load.ts",
|
|
60
|
+
"type-check": "tsc --noEmit",
|
|
61
|
+
"type-check:watch": "npm run type-check -- --watch"
|
|
62
|
+
},
|
|
63
|
+
"dependencies": {
|
|
64
|
+
"@biryani/core": "^0.2.1",
|
|
65
|
+
"cheerio": "^1.1.2",
|
|
66
|
+
"command-line-args": "^5.1.1",
|
|
67
|
+
"dotenv": "^8.2.0",
|
|
68
|
+
"fs-extra": "^9.1.0",
|
|
69
|
+
"jsdom": "^26.0.0",
|
|
70
|
+
"kysely": "^0.27.4",
|
|
71
|
+
"luxon": "^3.7.2",
|
|
72
|
+
"node-stream-zip": "^1.8.2",
|
|
73
|
+
"pg": "^8.13.1",
|
|
74
|
+
"pg-cursor": "^2.12.1",
|
|
75
|
+
"p-limit": "^7.2.0",
|
|
76
|
+
"slug": "^11.0.0",
|
|
77
|
+
"tsx": "^4.20.6",
|
|
78
|
+
"windows-1252": "^1.0.0"
|
|
79
|
+
},
|
|
80
|
+
"devDependencies": {
|
|
81
|
+
"@typed-code/schemats": "^5.0.1",
|
|
82
|
+
"@types/cheerio": "^1.0.0",
|
|
83
|
+
"@types/command-line-args": "^5.0.0",
|
|
84
|
+
"@types/fs-extra": "^9.0.7",
|
|
85
|
+
"@types/jsdom": "^21.1.7",
|
|
86
|
+
"@types/luxon": "^3.7.1",
|
|
87
|
+
"@types/node": "^20.17.6",
|
|
88
|
+
"@types/pg": "^8.15.5",
|
|
89
|
+
"@types/pg-cursor": "^2.7.2",
|
|
90
|
+
"@types/slug": "^5.0.9",
|
|
91
|
+
"@typescript-eslint/eslint-plugin": "^8.46.0",
|
|
92
|
+
"@typescript-eslint/parser": "^8.46.0",
|
|
93
|
+
"cross-env": "^10.1.0",
|
|
94
|
+
"eslint": "^8.57.1",
|
|
95
|
+
"iconv-lite": "^0.7.0",
|
|
96
|
+
"kysely-codegen": "^0.19.0",
|
|
97
|
+
"prettier": "^3.5.3",
|
|
98
|
+
"tslib": "^2.1.0",
|
|
99
|
+
"typescript": "^5.9.3"
|
|
100
|
+
}
|
|
101
|
+
}
|