@tricoteuses/senat 1.4.1 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -6
- package/lib/datasets.d.ts +4 -3
- package/lib/datasets.js +3 -3
- package/lib/index.d.ts +1 -1
- package/lib/index.js +1 -1
- package/lib/loaders.d.ts +4 -3
- package/lib/loaders.js +7 -6
- package/lib/model/agenda.d.ts +2 -0
- package/lib/model/agenda.js +85 -0
- package/lib/model/dosleg.d.ts +2 -2
- package/lib/model/dosleg.js +4 -2
- package/lib/scripts/convert_data.js +116 -102
- package/lib/scripts/datautil.d.ts +1 -2
- package/lib/scripts/datautil.js +1 -1
- package/lib/scripts/parse_textes.js +4 -4
- package/lib/scripts/retrieve_agenda.d.ts +1 -0
- package/lib/scripts/retrieve_agenda.js +73 -0
- package/lib/scripts/retrieve_documents.js +11 -16
- package/lib/scripts/retrieve_open_data.js +4 -3
- package/lib/scripts/shared/cli_helpers.d.ts +11 -0
- package/lib/scripts/shared/cli_helpers.js +8 -1
- package/lib/types/agenda.d.ts +12 -0
- package/lib/types/agenda.js +1 -0
- package/lib/types/sessions.d.ts +5 -42
- package/lib/types/sessions.js +84 -43
- package/package.json +3 -2
package/README.md
CHANGED
|
@@ -43,19 +43,22 @@ npm run data:download ../senat-data -- [--categories All]
|
|
|
43
43
|
|
|
44
44
|
Data from other sources is also available :
|
|
45
45
|
```bash
|
|
46
|
-
# Retrieval of sénateurs' pictures from Sénat's website
|
|
47
|
-
npm run data:retrieve_senateurs_photos ../senat-data
|
|
48
|
-
|
|
49
46
|
# Retrieval of textes and rapports from Sénat's website
|
|
50
47
|
# Available options for optional `formats` parameter : xml, html, pdf
|
|
51
48
|
# Available options for optional `types` parameter : textes, rapports
|
|
52
|
-
npm run data:retrieve_documents ../senat-data -- --
|
|
49
|
+
npm run data:retrieve_documents ../senat-data -- --fromSession 2023 [--formats xml pdf] [--types textes]
|
|
53
50
|
|
|
54
51
|
# Retrieval & parsing (textes in xml format only for now)
|
|
55
|
-
npm run data:retrieve_documents ../senat-data -- --
|
|
52
|
+
npm run data:retrieve_documents ../senat-data -- --fromSession 2023 --parseDocuments
|
|
56
53
|
|
|
57
54
|
# Parsing only
|
|
58
55
|
npm run data:parse_textes_lois ../senat-data
|
|
56
|
+
|
|
57
|
+
# Retrieval of agenda from Sénat's website
|
|
58
|
+
npm run data:retrieve_agenda ../senat-data -- --fromSession 2023
|
|
59
|
+
|
|
60
|
+
# Retrieval of sénateurs' pictures from Sénat's website
|
|
61
|
+
npm run data:retrieve_senateurs_photos ../senat-data
|
|
59
62
|
```
|
|
60
63
|
|
|
61
64
|
## Data download using Docker
|
|
@@ -67,7 +70,7 @@ docker volume create senat-data # Create a volume to download the data
|
|
|
67
70
|
docker run --name tricoteuses-senat -v senat-data:/app/senat-data -d registry.en-root.org/tricoteuses/tricoteuses-senat:latest
|
|
68
71
|
```
|
|
69
72
|
|
|
70
|
-
Use the environment variable `CATEGORIES` and `
|
|
73
|
+
Use the environment variable `CATEGORIES` and `FROM_SESSION` if needed.
|
|
71
74
|
|
|
72
75
|
## Using the data
|
|
73
76
|
|
package/lib/datasets.d.ts
CHANGED
|
@@ -21,8 +21,9 @@ export declare enum EnabledDatasets {
|
|
|
21
21
|
Questions = 8,
|
|
22
22
|
Sens = 16,
|
|
23
23
|
PhotosSenateurs = 32,
|
|
24
|
-
|
|
24
|
+
Agenda = 64,
|
|
25
|
+
All = 127
|
|
25
26
|
}
|
|
26
27
|
export declare const datasets: Datasets;
|
|
27
|
-
export declare function getEnabledDatasets(categories:
|
|
28
|
-
export declare function
|
|
28
|
+
export declare function getEnabledDatasets(categories: string[]): EnabledDatasets;
|
|
29
|
+
export declare function getChosenDatasets(enabledDatasets: EnabledDatasets): Dataset[];
|
package/lib/datasets.js
CHANGED
|
@@ -10,7 +10,8 @@ export var EnabledDatasets;
|
|
|
10
10
|
EnabledDatasets[EnabledDatasets["Questions"] = 8] = "Questions";
|
|
11
11
|
EnabledDatasets[EnabledDatasets["Sens"] = 16] = "Sens";
|
|
12
12
|
EnabledDatasets[EnabledDatasets["PhotosSenateurs"] = 32] = "PhotosSenateurs";
|
|
13
|
-
EnabledDatasets[EnabledDatasets["
|
|
13
|
+
EnabledDatasets[EnabledDatasets["Agenda"] = 64] = "Agenda";
|
|
14
|
+
EnabledDatasets[EnabledDatasets["All"] = 127] = "All";
|
|
14
15
|
})(EnabledDatasets || (EnabledDatasets = {}));
|
|
15
16
|
export const datasets = {
|
|
16
17
|
ameli: {
|
|
@@ -68,8 +69,7 @@ export function getEnabledDatasets(categories) {
|
|
|
68
69
|
return enabledDatasets | (enabledDataset || EnabledDatasets.None);
|
|
69
70
|
}, EnabledDatasets.None);
|
|
70
71
|
}
|
|
71
|
-
export function
|
|
72
|
-
const enabledDatasets = getEnabledDatasets(categories);
|
|
72
|
+
export function getChosenDatasets(enabledDatasets) {
|
|
73
73
|
return [
|
|
74
74
|
enabledDatasets & EnabledDatasets.Ameli ? datasets.ameli : null,
|
|
75
75
|
enabledDatasets & EnabledDatasets.Debats ? datasets.debats : null,
|
package/lib/index.d.ts
CHANGED
|
@@ -6,5 +6,5 @@ export type { Ses, Sub, TxtAmeli } from "./types/ameli";
|
|
|
6
6
|
export type { Debat, LecAssDeb } from "./types/debats";
|
|
7
7
|
export type { Ass, Aud, Auteur, DateSeance, DecCoc, DenRap, DocAtt, Ecr, EtaLoi, LecAss, LecAssRap, Lecture, Loi, Org, OriTxt, Qua, Rap, RapOrg, Scr, Texte, TypAtt, TypLec, TypLoi, TypTxt, TypUrl, } from "./types/dosleg";
|
|
8
8
|
export type { Photo, Sen } from "./types/sens";
|
|
9
|
-
export { Session } from "./types/sessions";
|
|
9
|
+
export type { UNDEFINED_SESSION, Session, getSessionsFromStart } from "./types/sessions";
|
|
10
10
|
export type { Alinea, Article, Division, DivisionTag, FlatTexte } from "./types/texte";
|
package/lib/index.js
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
export {
|
|
1
|
+
export {};
|
package/lib/loaders.d.ts
CHANGED
|
@@ -4,15 +4,16 @@ import { QuestionResult } from "./model/questions";
|
|
|
4
4
|
import { CirconscriptionResult, OrganismeResult, SenateurResult } from "./model/sens";
|
|
5
5
|
import { FlatTexte } from "./types/texte";
|
|
6
6
|
export { EnabledDatasets } from "./datasets";
|
|
7
|
+
export declare const AGENDA_FOLDER = "agenda";
|
|
7
8
|
export declare const DOSLEG_DOSSIERS_FOLDER = "dossiers";
|
|
9
|
+
export declare const RAPPORT_FOLDER = "rap";
|
|
8
10
|
export declare const SENS_CIRCONSCRIPTIONS_FOLDER = "circonscriptions";
|
|
9
11
|
export declare const SENS_ORGANISMES_FOLDER = "organismes";
|
|
10
12
|
export declare const SENS_SENATEURS_FOLDER = "senateurs";
|
|
11
13
|
export declare const TEXTE_FOLDER = "leg";
|
|
12
|
-
export declare const
|
|
13
|
-
export declare const
|
|
14
|
+
export declare const DATA_ORIGINAL_FOLDER = "original";
|
|
15
|
+
export declare const DATA_TRANSFORMED_FOLDER = "transformed";
|
|
14
16
|
export declare const DOCUMENT_METADATA_FILE = "metadata.json";
|
|
15
|
-
export declare const RAPPORT_FOLDER = "rap";
|
|
16
17
|
type IterItem<T> = {
|
|
17
18
|
item: T;
|
|
18
19
|
filePathFromDataset?: string;
|
package/lib/loaders.js
CHANGED
|
@@ -2,17 +2,18 @@ import fs from "fs";
|
|
|
2
2
|
import path from "path";
|
|
3
3
|
import legislatures from "./legislatures.json";
|
|
4
4
|
import { datasets } from "./datasets";
|
|
5
|
-
import { UNDEFINED_SESSION } from "./
|
|
5
|
+
import { UNDEFINED_SESSION } from "./types/sessions";
|
|
6
6
|
export { EnabledDatasets } from "./datasets";
|
|
7
|
+
export const AGENDA_FOLDER = "agenda";
|
|
7
8
|
export const DOSLEG_DOSSIERS_FOLDER = "dossiers";
|
|
9
|
+
export const RAPPORT_FOLDER = "rap";
|
|
8
10
|
export const SENS_CIRCONSCRIPTIONS_FOLDER = "circonscriptions";
|
|
9
11
|
export const SENS_ORGANISMES_FOLDER = "organismes";
|
|
10
12
|
export const SENS_SENATEURS_FOLDER = "senateurs";
|
|
11
13
|
export const TEXTE_FOLDER = "leg";
|
|
12
|
-
export const
|
|
13
|
-
export const
|
|
14
|
+
export const DATA_ORIGINAL_FOLDER = "original";
|
|
15
|
+
export const DATA_TRANSFORMED_FOLDER = "transformed";
|
|
14
16
|
export const DOCUMENT_METADATA_FILE = "metadata.json";
|
|
15
|
-
export const RAPPORT_FOLDER = "rap";
|
|
16
17
|
export function* iterFilePaths(dirPath) {
|
|
17
18
|
if (dirPath && fs.existsSync(dirPath)) {
|
|
18
19
|
const files = fs.readdirSync(dirPath, {
|
|
@@ -59,7 +60,7 @@ export function* iterLoadSenatDossiersLegislatifs(dataDir, session, options = {}
|
|
|
59
60
|
}
|
|
60
61
|
}
|
|
61
62
|
export function* iterLoadSenatDossiersLegislatifsTexteUrls(dataDir, session) {
|
|
62
|
-
let itemsDir = path.join(dataDir, TEXTE_FOLDER,
|
|
63
|
+
let itemsDir = path.join(dataDir, TEXTE_FOLDER, DATA_ORIGINAL_FOLDER);
|
|
63
64
|
if (session) {
|
|
64
65
|
itemsDir = path.join(itemsDir, session.toString());
|
|
65
66
|
}
|
|
@@ -121,7 +122,7 @@ export function* iterLoadSenatDossiersLegislatifsTextes(dataDir, session, option
|
|
|
121
122
|
export function loadSenatTexteContent(dataDir, textePathFromDataset) {
|
|
122
123
|
const parsedTextePath = path.parse(textePathFromDataset);
|
|
123
124
|
const jsonTexteName = `${parsedTextePath.name}.json`;
|
|
124
|
-
const fullTextePath = path.join(dataDir, TEXTE_FOLDER,
|
|
125
|
+
const fullTextePath = path.join(dataDir, TEXTE_FOLDER, DATA_TRANSFORMED_FOLDER, parsedTextePath.dir, jsonTexteName);
|
|
125
126
|
if (!fs.existsSync(fullTextePath)) {
|
|
126
127
|
return { item: null };
|
|
127
128
|
}
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import { JSDOM } from "jsdom";
|
|
2
|
+
import { DateTime } from "luxon";
|
|
3
|
+
function getEventType(eventClasses) {
|
|
4
|
+
const typeClass = [...eventClasses]
|
|
5
|
+
.find(className => className.startsWith("evt-"))
|
|
6
|
+
|| null;
|
|
7
|
+
switch (typeClass) {
|
|
8
|
+
case "evt-seance":
|
|
9
|
+
return "Séance publique";
|
|
10
|
+
case "evt-instanz":
|
|
11
|
+
return "Commissions";
|
|
12
|
+
case "evt-cemi":
|
|
13
|
+
return "Mission de contrôle";
|
|
14
|
+
case "evt-deleg":
|
|
15
|
+
return "Offices et délégations";
|
|
16
|
+
case "evt-bureau":
|
|
17
|
+
return "Instances décisionnelles";
|
|
18
|
+
}
|
|
19
|
+
return null;
|
|
20
|
+
}
|
|
21
|
+
function getUrlDossierSenat(lienElements) {
|
|
22
|
+
const urlElement = [...lienElements]
|
|
23
|
+
.find(lienElement => lienElement.textContent?.includes("dossier législatif"));
|
|
24
|
+
return urlElement ? urlElement.getAttribute("href") : null;
|
|
25
|
+
}
|
|
26
|
+
function getHeuresDebutFin(timeStr) {
|
|
27
|
+
const normalizedHeureDebut = timeStr
|
|
28
|
+
?.replace(/^À l'issue de l'espace réservé .* et au plus tard\s/i, "") // Must be first
|
|
29
|
+
?.replace(/^(?:le )?matin/i, "10h00")
|
|
30
|
+
?.replace(/^(?:l')?après-midi/i, "16h00")
|
|
31
|
+
?.replace(/^(?:le )?soir/i, "20h00")
|
|
32
|
+
?.replace(/^(?:la )?nuit/i, "22h00")
|
|
33
|
+
?.replace(/^à\s/ig, "")
|
|
34
|
+
?.replace(/heures/ig, "h00")
|
|
35
|
+
?.replace(/\set.*/i, "")
|
|
36
|
+
?.replace(/,.*/, "")
|
|
37
|
+
?.replace(/\s\(hors hémicycle\)/i, "")
|
|
38
|
+
?.replace(/\s*h\s*/ig, "h");
|
|
39
|
+
console.dir(`${timeStr};${normalizedHeureDebut}`);
|
|
40
|
+
const heureDebut = timeStr
|
|
41
|
+
? DateTime.fromFormat(timeStr, "H'h'mm").toISOTime()
|
|
42
|
+
: null;
|
|
43
|
+
return {
|
|
44
|
+
heureDebut,
|
|
45
|
+
heureFin: null
|
|
46
|
+
};
|
|
47
|
+
}
|
|
48
|
+
function transformAgenda(document) {
|
|
49
|
+
const agendaEvents = [];
|
|
50
|
+
const eventElements = document.querySelectorAll(".evt");
|
|
51
|
+
for (const eventElement of eventElements) {
|
|
52
|
+
const type = getEventType(eventElement.classList);
|
|
53
|
+
const timeOriginal = eventElement.querySelector(".time")?.textContent || null;
|
|
54
|
+
const { heureDebut, heureFin } = getHeuresDebutFin(timeOriginal);
|
|
55
|
+
const titre = eventElement.querySelector(".titre")?.textContent || null;
|
|
56
|
+
const organe = eventElement.querySelector(".organe")?.textContent || null;
|
|
57
|
+
const objet = eventElement.querySelector(".objet")?.textContent || null;
|
|
58
|
+
const lieu = eventElement.querySelector(".lieu")?.textContent || null;
|
|
59
|
+
const url_dossier_senat = getUrlDossierSenat(eventElement.querySelectorAll(".lien a"));
|
|
60
|
+
const url_video = eventElement.querySelector(".video a")?.getAttribute("href") || null;
|
|
61
|
+
agendaEvents.push({
|
|
62
|
+
type,
|
|
63
|
+
heureDebut,
|
|
64
|
+
heureFin,
|
|
65
|
+
timeOriginal,
|
|
66
|
+
titre,
|
|
67
|
+
organe,
|
|
68
|
+
objet,
|
|
69
|
+
lieu,
|
|
70
|
+
url_dossier_senat,
|
|
71
|
+
url_video,
|
|
72
|
+
});
|
|
73
|
+
}
|
|
74
|
+
return agendaEvents;
|
|
75
|
+
}
|
|
76
|
+
export async function parseAgendaFromFile(htmlFilePath) {
|
|
77
|
+
try {
|
|
78
|
+
const { document } = (await JSDOM.fromFile(htmlFilePath, { contentType: "text/html" })).window;
|
|
79
|
+
return transformAgenda(document);
|
|
80
|
+
}
|
|
81
|
+
catch (error) {
|
|
82
|
+
console.error(`Could not parse texte with error ${error}`);
|
|
83
|
+
}
|
|
84
|
+
return null;
|
|
85
|
+
}
|
package/lib/model/dosleg.d.ts
CHANGED
|
@@ -214,12 +214,12 @@ export declare function findAll(): AsyncIterableIterator<{
|
|
|
214
214
|
numero_JO: string | null;
|
|
215
215
|
url_JO: string | null;
|
|
216
216
|
}>;
|
|
217
|
-
export declare function findSenatTexteUrls(sessions?:
|
|
217
|
+
export declare function findSenatTexteUrls(sessions?: number[]): AsyncIterableIterator<{
|
|
218
218
|
session: string | null | undefined;
|
|
219
219
|
url: string;
|
|
220
220
|
hasExposeDesMotifs: boolean;
|
|
221
221
|
}>;
|
|
222
|
-
export declare function findSenatRapportUrls(sessions?:
|
|
222
|
+
export declare function findSenatRapportUrls(sessions?: number[]): AsyncIterableIterator<{
|
|
223
223
|
url: string;
|
|
224
224
|
session: string | null | undefined;
|
|
225
225
|
}>;
|
package/lib/model/dosleg.js
CHANGED
|
@@ -170,11 +170,12 @@ export function findAll() {
|
|
|
170
170
|
return findAllQuery.stream();
|
|
171
171
|
}
|
|
172
172
|
export function findSenatTexteUrls(sessions = []) {
|
|
173
|
+
const sessionsStr = sessions.map(session => String(session));
|
|
173
174
|
return dbDosleg
|
|
174
175
|
.selectFrom("texte")
|
|
175
176
|
.where("texurl", "is not", null)
|
|
176
177
|
.where("typurl", "=", "I")
|
|
177
|
-
.$if(sessions.length > 0, (qb) => qb.where("sesann", "in",
|
|
178
|
+
.$if(sessions.length > 0, (qb) => qb.where("sesann", "in", sessionsStr))
|
|
178
179
|
.select(({ eb, ref }) => [
|
|
179
180
|
"sesann as session",
|
|
180
181
|
rtrim(ref("texurl")).as("url"),
|
|
@@ -190,11 +191,12 @@ export function findSenatTexteUrls(sessions = []) {
|
|
|
190
191
|
.stream();
|
|
191
192
|
}
|
|
192
193
|
export function findSenatRapportUrls(sessions = []) {
|
|
194
|
+
const sessionsStr = sessions.map(session => String(session));
|
|
193
195
|
return dbDosleg
|
|
194
196
|
.selectFrom("rap")
|
|
195
197
|
.where("rapurl", "is not", null)
|
|
196
198
|
.where("typurl", "=", "I")
|
|
197
|
-
.$if(sessions.length > 0, (qb) => qb.where("sesann", "in",
|
|
199
|
+
.$if(sessions.length > 0, (qb) => qb.where("sesann", "in", sessionsStr))
|
|
198
200
|
.select(({ ref }) => [rtrim(ref("rapurl")).as("url"), "sesann as session"])
|
|
199
201
|
.$narrowType()
|
|
200
202
|
.stream();
|
|
@@ -3,10 +3,11 @@ import commandLineArgs from "command-line-args";
|
|
|
3
3
|
import fs from "fs-extra";
|
|
4
4
|
import path from "path";
|
|
5
5
|
import { datasets, EnabledDatasets, getEnabledDatasets } from "../datasets";
|
|
6
|
-
import { DOCUMENT_METADATA_FILE, DOSLEG_DOSSIERS_FOLDER, RAPPORT_FOLDER, SENS_CIRCONSCRIPTIONS_FOLDER, SENS_ORGANISMES_FOLDER, SENS_SENATEURS_FOLDER, TEXTE_FOLDER,
|
|
6
|
+
import { DOCUMENT_METADATA_FILE, DOSLEG_DOSSIERS_FOLDER, RAPPORT_FOLDER, SENS_CIRCONSCRIPTIONS_FOLDER, SENS_ORGANISMES_FOLDER, SENS_SENATEURS_FOLDER, TEXTE_FOLDER, DATA_ORIGINAL_FOLDER, } from "../loaders";
|
|
7
7
|
import { findAllAmendements, findAllCirconscriptions, findAllLois, findAllOrganismes, findAllQuestions, findAllSens, } from "../model";
|
|
8
8
|
import { findSenatRapportUrls, findSenatTexteUrls } from "../model/dosleg";
|
|
9
|
-
import {
|
|
9
|
+
import { getSessionsFromStart, UNDEFINED_SESSION } from "../types/sessions";
|
|
10
|
+
import { formatToFourDigitSession, SIGNET_STRUCTURE_REGEXP, } from "./datautil";
|
|
10
11
|
import { commonOptions } from "./shared/cli_helpers";
|
|
11
12
|
import { ensureAndClearDir } from "./shared/util";
|
|
12
13
|
const optionsDefinitions = [...commonOptions];
|
|
@@ -16,123 +17,99 @@ const SENAT_TEXTE_BASE_URL = "https://www.senat.fr/leg/";
|
|
|
16
17
|
const SENAT_EXPOSE_DES_MOTIFS_BASE_URL = "https://www.senat.fr/leg/exposes-des-motifs/";
|
|
17
18
|
const SENAT_RAPPORT_BASE_URL = "https://www.senat.fr/rap/";
|
|
18
19
|
async function convertData() {
|
|
19
|
-
const enabledDatasets = getEnabledDatasets(options["categories"]);
|
|
20
20
|
const dataDir = options["dataDir"];
|
|
21
21
|
assert(dataDir, "Missing argument: data directory");
|
|
22
|
+
const enabledDatasets = getEnabledDatasets(options["categories"]);
|
|
23
|
+
const sessions = getSessionsFromStart(options["fromSession"]);
|
|
22
24
|
console.time("data transformation time");
|
|
23
25
|
if (enabledDatasets & EnabledDatasets.Ameli) {
|
|
24
|
-
|
|
25
|
-
if (!options["silent"]) {
|
|
26
|
-
console.log(`Converting database ${dataset.database} data into files…`);
|
|
27
|
-
}
|
|
28
|
-
const ameliReorganizedRootDir = path.join(dataDir, dataset.database);
|
|
29
|
-
ensureAndClearDir(ameliReorganizedRootDir);
|
|
30
|
-
for await (const amendement of findAllAmendements()) {
|
|
31
|
-
if (options["verbose"]) {
|
|
32
|
-
console.log(`Converting ${amendement.numero} file…`);
|
|
33
|
-
}
|
|
34
|
-
const session = String(amendement.session) || UNDEFINED_SESSION;
|
|
35
|
-
const signetDossierLegislatif = amendement.signet_dossier_legislatif ||
|
|
36
|
-
`${amendement.nature_texte}-${amendement.numero_texte}`.toLowerCase();
|
|
37
|
-
const ameliReorganizedDir = path.join(ameliReorganizedRootDir, session, signetDossierLegislatif);
|
|
38
|
-
fs.ensureDirSync(ameliReorganizedDir);
|
|
39
|
-
const amendementFileName = `${amendement.numero}.json`;
|
|
40
|
-
fs.writeJSONSync(path.join(ameliReorganizedDir, amendementFileName), amendement, { spaces: 2 });
|
|
41
|
-
}
|
|
26
|
+
await convertDatasetAmeli(dataDir);
|
|
42
27
|
}
|
|
43
28
|
if (enabledDatasets & EnabledDatasets.DosLeg) {
|
|
44
|
-
|
|
45
|
-
if (!options["silent"]) {
|
|
46
|
-
console.log(`Converting database ${dataset.database} data into files…`);
|
|
47
|
-
}
|
|
48
|
-
const doslegReorganizedRootDir = path.join(dataDir, dataset.database);
|
|
49
|
-
const dossiersReorganizedDir = path.join(doslegReorganizedRootDir, DOSLEG_DOSSIERS_FOLDER);
|
|
50
|
-
ensureAndClearDir(doslegReorganizedRootDir);
|
|
51
|
-
ensureAndClearDir(dossiersReorganizedDir);
|
|
52
|
-
for await (const loi of findAllLois()) {
|
|
53
|
-
if (options["verbose"]) {
|
|
54
|
-
console.log(`Converting ${loi.signet} file…`);
|
|
55
|
-
}
|
|
56
|
-
let loiReorganizedDir = path.join(dossiersReorganizedDir, UNDEFINED_SESSION);
|
|
57
|
-
const signetParts = SIGNET_STRUCTURE_REGEXP.exec(loi.signet)?.groups;
|
|
58
|
-
if (signetParts && "session" in signetParts) {
|
|
59
|
-
const { session } = signetParts;
|
|
60
|
-
const formattedSession = formatToFourDigitSession(session);
|
|
61
|
-
loiReorganizedDir = path.join(dossiersReorganizedDir, formattedSession);
|
|
62
|
-
}
|
|
63
|
-
fs.ensureDirSync(loiReorganizedDir);
|
|
64
|
-
const loiFileName = `${loi.signet}.json`;
|
|
65
|
-
fs.writeJSONSync(path.join(loiReorganizedDir, loiFileName), loi, {
|
|
66
|
-
spaces: 2,
|
|
67
|
-
});
|
|
68
|
-
}
|
|
69
|
-
await convertTexteUrls(dataDir);
|
|
70
|
-
await convertRapportUrls(dataDir);
|
|
29
|
+
await convertDatasetDosLeg(dataDir, sessions);
|
|
71
30
|
}
|
|
72
31
|
if (enabledDatasets & EnabledDatasets.Questions) {
|
|
73
|
-
|
|
74
|
-
if (!options["silent"]) {
|
|
75
|
-
console.log(`Converting database ${dataset.database} data into files…`);
|
|
76
|
-
}
|
|
77
|
-
const questionsReorganizedRootDir = path.join(dataDir, dataset.database);
|
|
78
|
-
ensureAndClearDir(questionsReorganizedRootDir);
|
|
79
|
-
for await (const question of findAllQuestions()) {
|
|
80
|
-
if (options["verbose"]) {
|
|
81
|
-
console.log(`Converting ${question.reference} file…`);
|
|
82
|
-
}
|
|
83
|
-
const legislature = question.legislature ? question.legislature : 0;
|
|
84
|
-
const questionReorganizedDir = path.join(questionsReorganizedRootDir, String(legislature));
|
|
85
|
-
fs.ensureDirSync(questionReorganizedDir);
|
|
86
|
-
const questionFileName = `${question.reference}.json`;
|
|
87
|
-
fs.writeJSONSync(path.join(questionReorganizedDir, questionFileName), question, { spaces: 2 });
|
|
88
|
-
}
|
|
32
|
+
await convertDatasetQuestions(dataDir);
|
|
89
33
|
}
|
|
90
34
|
if (enabledDatasets & EnabledDatasets.Sens) {
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
}
|
|
107
|
-
const senFileName = `${sen.matricule}.json`;
|
|
108
|
-
fs.writeJSONSync(path.join(senateursReorganizedDir, senFileName), sen, {
|
|
109
|
-
spaces: 2,
|
|
110
|
-
});
|
|
35
|
+
await convertDatasetSens(dataDir);
|
|
36
|
+
}
|
|
37
|
+
if (!options["silent"]) {
|
|
38
|
+
console.timeEnd("data transformation time");
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
async function convertDatasetAmeli(dataDir) {
|
|
42
|
+
const dataset = datasets.ameli;
|
|
43
|
+
if (!options["silent"]) {
|
|
44
|
+
console.log(`Converting database ${dataset.database} data into files…`);
|
|
45
|
+
}
|
|
46
|
+
const ameliReorganizedRootDir = path.join(dataDir, dataset.database);
|
|
47
|
+
ensureAndClearDir(ameliReorganizedRootDir);
|
|
48
|
+
for await (const amendement of findAllAmendements()) {
|
|
49
|
+
if (options["verbose"]) {
|
|
50
|
+
console.log(`Converting ${amendement.numero} file…`);
|
|
111
51
|
}
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
52
|
+
const session = String(amendement.session) || UNDEFINED_SESSION;
|
|
53
|
+
const signetDossierLegislatif = amendement.signet_dossier_legislatif ||
|
|
54
|
+
`${amendement.nature_texte}-${amendement.numero_texte}`.toLowerCase();
|
|
55
|
+
const ameliReorganizedDir = path.join(ameliReorganizedRootDir, String(session), signetDossierLegislatif);
|
|
56
|
+
fs.ensureDirSync(ameliReorganizedDir);
|
|
57
|
+
const amendementFileName = `${amendement.numero}.json`;
|
|
58
|
+
fs.writeJSONSync(path.join(ameliReorganizedDir, amendementFileName), amendement, { spaces: 2 });
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
async function convertDatasetDosLeg(dataDir, sessions) {
|
|
62
|
+
const dataset = datasets.dosleg;
|
|
63
|
+
if (!options["silent"]) {
|
|
64
|
+
console.log(`Converting database ${dataset.database} data into files…`);
|
|
65
|
+
}
|
|
66
|
+
const doslegReorganizedRootDir = path.join(dataDir, dataset.database);
|
|
67
|
+
const dossiersReorganizedDir = path.join(doslegReorganizedRootDir, DOSLEG_DOSSIERS_FOLDER);
|
|
68
|
+
ensureAndClearDir(doslegReorganizedRootDir);
|
|
69
|
+
ensureAndClearDir(dossiersReorganizedDir);
|
|
70
|
+
for await (const loi of findAllLois()) {
|
|
71
|
+
if (options["verbose"]) {
|
|
72
|
+
console.log(`Converting ${loi.signet} file…`);
|
|
118
73
|
}
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
}
|
|
123
|
-
const
|
|
124
|
-
|
|
74
|
+
let loiReorganizedDir = path.join(dossiersReorganizedDir, String(UNDEFINED_SESSION));
|
|
75
|
+
const signetParts = SIGNET_STRUCTURE_REGEXP.exec(loi.signet)?.groups;
|
|
76
|
+
if (signetParts && "session" in signetParts) {
|
|
77
|
+
const { session } = signetParts;
|
|
78
|
+
const formattedSession = formatToFourDigitSession(session);
|
|
79
|
+
loiReorganizedDir = path.join(dossiersReorganizedDir, String(formattedSession));
|
|
125
80
|
}
|
|
81
|
+
fs.ensureDirSync(loiReorganizedDir);
|
|
82
|
+
const loiFileName = `${loi.signet}.json`;
|
|
83
|
+
fs.writeJSONSync(path.join(loiReorganizedDir, loiFileName), loi, {
|
|
84
|
+
spaces: 2,
|
|
85
|
+
});
|
|
126
86
|
}
|
|
87
|
+
await convertTexteUrls(dataDir, sessions);
|
|
88
|
+
await convertRapportUrls(dataDir, sessions);
|
|
89
|
+
}
|
|
90
|
+
async function convertDatasetQuestions(dataDir) {
|
|
91
|
+
const dataset = datasets.questions;
|
|
127
92
|
if (!options["silent"]) {
|
|
128
|
-
console.
|
|
93
|
+
console.log(`Converting database ${dataset.database} data into files…`);
|
|
94
|
+
}
|
|
95
|
+
const questionsReorganizedRootDir = path.join(dataDir, dataset.database);
|
|
96
|
+
ensureAndClearDir(questionsReorganizedRootDir);
|
|
97
|
+
for await (const question of findAllQuestions()) {
|
|
98
|
+
if (options["verbose"]) {
|
|
99
|
+
console.log(`Converting ${question.reference} file…`);
|
|
100
|
+
}
|
|
101
|
+
const legislature = question.legislature ? question.legislature : 0;
|
|
102
|
+
const questionReorganizedDir = path.join(questionsReorganizedRootDir, String(legislature));
|
|
103
|
+
fs.ensureDirSync(questionReorganizedDir);
|
|
104
|
+
const questionFileName = `${question.reference}.json`;
|
|
105
|
+
fs.writeJSONSync(path.join(questionReorganizedDir, questionFileName), question, { spaces: 2 });
|
|
129
106
|
}
|
|
130
107
|
}
|
|
131
|
-
async function convertTexteUrls(dataDir) {
|
|
108
|
+
async function convertTexteUrls(dataDir, sessions) {
|
|
132
109
|
const textesDir = path.join(dataDir, TEXTE_FOLDER);
|
|
133
110
|
fs.ensureDirSync(textesDir);
|
|
134
|
-
const originalTextesDir = path.join(textesDir,
|
|
135
|
-
for await (const texte of findSenatTexteUrls(
|
|
111
|
+
const originalTextesDir = path.join(textesDir, DATA_ORIGINAL_FOLDER);
|
|
112
|
+
for await (const texte of findSenatTexteUrls(sessions)) {
|
|
136
113
|
const texteName = path.parse(texte.url).name;
|
|
137
114
|
const texteDir = path.join(originalTextesDir, `${texte.session ?? UNDEFINED_SESSION}`, texteName);
|
|
138
115
|
fs.ensureDirSync(texteDir);
|
|
@@ -151,10 +128,10 @@ async function convertTexteUrls(dataDir) {
|
|
|
151
128
|
});
|
|
152
129
|
}
|
|
153
130
|
}
|
|
154
|
-
async function convertRapportUrls(dataDir) {
|
|
131
|
+
async function convertRapportUrls(dataDir, sessions) {
|
|
155
132
|
const rapportsDir = path.join(dataDir, RAPPORT_FOLDER);
|
|
156
133
|
fs.ensureDirSync(rapportsDir);
|
|
157
|
-
for await (const rapport of findSenatRapportUrls(
|
|
134
|
+
for await (const rapport of findSenatRapportUrls(sessions)) {
|
|
158
135
|
const parsedRapportUrl = path.parse(rapport.url);
|
|
159
136
|
const rapportName = parsedRapportUrl.name;
|
|
160
137
|
const rapportDir = path.join(rapportsDir, `${rapport.session ?? UNDEFINED_SESSION}`, rapportName);
|
|
@@ -180,6 +157,43 @@ async function convertRapportUrls(dataDir) {
|
|
|
180
157
|
});
|
|
181
158
|
}
|
|
182
159
|
}
|
|
160
|
+
async function convertDatasetSens(dataDir) {
|
|
161
|
+
const dataset = datasets.sens;
|
|
162
|
+
if (!options["silent"]) {
|
|
163
|
+
console.log(`Converting database ${dataset.database} data into files…`);
|
|
164
|
+
}
|
|
165
|
+
const sensReorganizedRootDir = path.join(dataDir, dataset.database);
|
|
166
|
+
const senateursReorganizedDir = path.join(sensReorganizedRootDir, SENS_SENATEURS_FOLDER);
|
|
167
|
+
const circonscriptionsReorganizedDir = path.join(sensReorganizedRootDir, SENS_CIRCONSCRIPTIONS_FOLDER);
|
|
168
|
+
const organismesReorganizedDir = path.join(sensReorganizedRootDir, SENS_ORGANISMES_FOLDER);
|
|
169
|
+
ensureAndClearDir(sensReorganizedRootDir);
|
|
170
|
+
ensureAndClearDir(senateursReorganizedDir);
|
|
171
|
+
ensureAndClearDir(circonscriptionsReorganizedDir);
|
|
172
|
+
ensureAndClearDir(organismesReorganizedDir);
|
|
173
|
+
for await (const sen of findAllSens()) {
|
|
174
|
+
if (options["verbose"]) {
|
|
175
|
+
console.log(`Converting ${sen.matricule} file…`);
|
|
176
|
+
}
|
|
177
|
+
const senFileName = `${sen.matricule}.json`;
|
|
178
|
+
fs.writeJSONSync(path.join(senateursReorganizedDir, senFileName), sen, {
|
|
179
|
+
spaces: 2,
|
|
180
|
+
});
|
|
181
|
+
}
|
|
182
|
+
for await (const circonscription of findAllCirconscriptions()) {
|
|
183
|
+
if (options["verbose"]) {
|
|
184
|
+
console.log(`Converting ${circonscription.identifiant} file…`);
|
|
185
|
+
}
|
|
186
|
+
const circonscriptionFileName = `${circonscription.identifiant}.json`;
|
|
187
|
+
fs.writeJSONSync(path.join(circonscriptionsReorganizedDir, circonscriptionFileName), circonscription, { spaces: 2 });
|
|
188
|
+
}
|
|
189
|
+
for await (const organisme of findAllOrganismes()) {
|
|
190
|
+
if (options["verbose"]) {
|
|
191
|
+
console.log(`Converting ${organisme.code} file…`);
|
|
192
|
+
}
|
|
193
|
+
const organismeFileName = `${organisme.code}.json`;
|
|
194
|
+
fs.writeJSONSync(path.join(organismesReorganizedDir, organismeFileName), organisme, { spaces: 2 });
|
|
195
|
+
}
|
|
196
|
+
}
|
|
183
197
|
convertData()
|
|
184
198
|
.then(() => process.exit(0))
|
|
185
199
|
.catch((error) => {
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
export declare const SIGNET_STRUCTURE_REGEXP: RegExp;
|
|
2
2
|
export declare const AKN_IDENTIFICATION_STRUCTURE_REGEXP: RegExp;
|
|
3
3
|
export declare const AKN_WORKFLOW_IDENTIFICATION_STRUCTURE_REGEXP: RegExp;
|
|
4
|
-
export declare
|
|
5
|
-
export declare function formatToFourDigitSession(session: string): string;
|
|
4
|
+
export declare function formatToFourDigitSession(session: string): string | 0;
|
package/lib/scripts/datautil.js
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
import { DateTime, Settings } from "luxon";
|
|
2
|
+
import { UNDEFINED_SESSION } from "../types/sessions";
|
|
2
3
|
Settings.twoDigitCutoffYear = 50;
|
|
3
4
|
export const SIGNET_STRUCTURE_REGEXP = /^(?<type>[a-z]+)(?<session>\d{2,4})-?(?<numTexte>\d+)?/;
|
|
4
5
|
export const AKN_IDENTIFICATION_STRUCTURE_REGEXP = /^\/akn\/fr\/(?<type>[a-z]+)\/(?<session>\d{4}-\d{4})\/?(?<numTexte>\d+)\/fr@(?<version>\b(?:RECT|RECT_BIS|RECT_TER|RECT_QUATER|RECT_QUINQUIES)\b)?/;
|
|
5
6
|
export const AKN_WORKFLOW_IDENTIFICATION_STRUCTURE_REGEXP = /^\/akn\/fr\/(?<type>[a-z]+)\/(?<session>\d{2,4})\/?(?<numTexte>[a-zA-Z0-9]+)\/fr@(?<version>\b(?:RECT|RECT_BIS|RECT_TER|RECT_QUATER|RECT_QUINQUIES)\b)?/;
|
|
6
|
-
export const UNDEFINED_SESSION = "0";
|
|
7
7
|
export function formatToFourDigitSession(session) {
|
|
8
8
|
if (session.length >= 2) {
|
|
9
9
|
const sessionFirstTwoDigits = session.substring(0, 2);
|
|
@@ -2,7 +2,7 @@ import assert from "assert";
|
|
|
2
2
|
import commandLineArgs from "command-line-args";
|
|
3
3
|
import fs from "fs-extra";
|
|
4
4
|
import path from "path";
|
|
5
|
-
import { iterFilePaths, TEXTE_FOLDER,
|
|
5
|
+
import { iterFilePaths, TEXTE_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, } from "../loaders";
|
|
6
6
|
import { parseExposeDesMotifsFromFile, parseTexteFromFile, } from "../model/texte";
|
|
7
7
|
import { commonOptions } from "./shared/cli_helpers";
|
|
8
8
|
import { ensureAndClearDir } from "./shared/util";
|
|
@@ -11,14 +11,14 @@ const options = commandLineArgs(optionsDefinitions);
|
|
|
11
11
|
async function main() {
|
|
12
12
|
const dataDir = options["dataDir"];
|
|
13
13
|
assert(dataDir, "Missing argument: data directory");
|
|
14
|
-
const transformedTextesDir = path.join(options["dataDir"], TEXTE_FOLDER,
|
|
14
|
+
const transformedTextesDir = path.join(options["dataDir"], TEXTE_FOLDER, DATA_TRANSFORMED_FOLDER);
|
|
15
15
|
ensureAndClearDir(transformedTextesDir);
|
|
16
|
-
for (const filePath of iterFilePaths(path.join(dataDir, TEXTE_FOLDER,
|
|
16
|
+
for (const filePath of iterFilePaths(path.join(dataDir, TEXTE_FOLDER, DATA_ORIGINAL_FOLDER))) {
|
|
17
17
|
const parsedFilePath = path.parse(filePath);
|
|
18
18
|
if (parsedFilePath.ext !== ".xml") {
|
|
19
19
|
continue;
|
|
20
20
|
}
|
|
21
|
-
const texteDirFromOriginal = parsedFilePath.dir.substring(filePath.indexOf(
|
|
21
|
+
const texteDirFromOriginal = parsedFilePath.dir.substring(filePath.indexOf(DATA_ORIGINAL_FOLDER) + DATA_ORIGINAL_FOLDER.length);
|
|
22
22
|
const transformedTexteDir = path.join(transformedTextesDir, texteDirFromOriginal);
|
|
23
23
|
fs.ensureDirSync(transformedTexteDir);
|
|
24
24
|
if (!options["silent"]) {
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import assert from "assert";
|
|
2
|
+
import commandLineArgs from "command-line-args";
|
|
3
|
+
import fs from "fs-extra";
|
|
4
|
+
import { DateTime } from "luxon";
|
|
5
|
+
import path from "path";
|
|
6
|
+
import { AGENDA_FOLDER, DATA_ORIGINAL_FOLDER } from "../loaders";
|
|
7
|
+
import { getSessionsFromStart } from "../types/sessions";
|
|
8
|
+
import { commonOptions } from "./shared/cli_helpers";
|
|
9
|
+
import { ensureAndClearDir } from "./shared/util";
|
|
10
|
+
const optionsDefinitions = [
|
|
11
|
+
...commonOptions,
|
|
12
|
+
{
|
|
13
|
+
help: "parse and convert documents into JSON (textes only for now, requires format xml)",
|
|
14
|
+
name: "parseDocuments",
|
|
15
|
+
type: Boolean,
|
|
16
|
+
},
|
|
17
|
+
];
|
|
18
|
+
const options = commandLineArgs(optionsDefinitions);
|
|
19
|
+
const SENAT_GLOBAL_AGENDA_URL_ROOT = "https://www.senat.fr/aglae/Global";
|
|
20
|
+
/**
 * Downloads the Sénat "global agenda" HTML pages, one per calendar day, for
 * every requested session.
 *
 * The agenda root folder (`dataDir`/AGENDA_FOLDER) is first handed to
 * `ensureAndClearDir`, then one sub-folder per session is created under
 * DATA_ORIGINAL_FOLDER. Each session value is used as a calendar year: days
 * from January 1st to December 31st are requested, never going further than
 * fifteen days after the current date. The raw response body of each day is
 * written to a file named after the day (`ddMMyyyy`, no extension).
 *
 * A day that cannot be retrieved (404, other HTTP error, empty body, network
 * failure) is reported on the console and skipped; the remaining days and
 * sessions are still processed.
 *
 * @param dataDir root directory of the data tree
 * @param sessions iterable of session years to retrieve
 * @returns a promise resolved once every day of every session has been attempted
 */
async function retrieveAgenda(dataDir, sessions) {
    const agendaRootDir = path.join(dataDir, AGENDA_FOLDER);
    ensureAndClearDir(agendaRootDir);
    const originalAgendaDir = path.join(agendaRootDir, DATA_ORIGINAL_FOLDER);
    fs.ensureDirSync(originalAgendaDir);
    // Upper bound shared by all sessions: agendas are not requested more than
    // fifteen days ahead of today.
    const fifteenDaysFromNow = new Date();
    fifteenDaysFromNow.setDate(fifteenDaysFromNow.getDate() + 15);
    for (const session of sessions) {
        if (!options["silent"]) {
            console.log(`Retrieving Agenda for session ${session}…`);
        }
        const agendaSessionDir = path.join(originalAgendaDir, `${session}`);
        fs.ensureDirSync(agendaSessionDir);
        const lastDayOfSession = new Date(session, 11, 31);
        for (const date = new Date(session, 0, 1); date <= lastDayOfSession && date <= fifteenDaysFromNow; date.setDate(date.getDate() + 1)) {
            const agendaName = DateTime.fromJSDate(date).toFormat("ddMMyyyy");
            try {
                const response = await fetch(`${SENAT_GLOBAL_AGENDA_URL_ROOT}/agl${agendaName}.html`);
                if (!response.ok) {
                    if (response.status === 404) {
                        console.warn(`Agenda ${agendaName} not found`);
                    }
                    else {
                        console.error(`An error occurred while retrieving Agenda ${agendaName}: ${response.status}`);
                    }
                    // Skip this day only: returning here would abandon every
                    // remaining day and session after the first missing page.
                    continue;
                }
                const agendaContent = await response.arrayBuffer();
                if (!agendaContent) {
                    continue;
                }
                fs.writeFileSync(path.join(agendaSessionDir, agendaName), Buffer.from(agendaContent));
            }
            catch (error) {
                // Network or file-system failure for this day: log and move on.
                console.error(error.message);
            }
        }
    }
}
|
|
58
|
+
/**
 * Entry point of the agenda retrieval script.
 *
 * Reads the data directory and the starting session from the parsed command
 * line options, retrieves the agenda for every session returned by
 * `getSessionsFromStart`, and prints the elapsed time unless the `silent`
 * option is set.
 */
async function main() {
    const timerLabel = "agenda processing time";
    const { dataDir } = options;
    assert(dataDir, "Missing argument: data directory");
    const requestedSessions = getSessionsFromStart(options.fromSession);
    console.time(timerLabel);
    await retrieveAgenda(dataDir, requestedSessions);
    if (options.silent) {
        return;
    }
    console.timeEnd(timerLabel);
}
// Run the script and translate the outcome into a process exit code.
main()
    .then(() => {
        process.exit(0);
    })
    .catch((failure) => {
        console.log(failure);
        process.exit(1);
    });
|
|
@@ -2,19 +2,13 @@ import assert from "assert";
|
|
|
2
2
|
import commandLineArgs from "command-line-args";
|
|
3
3
|
import fs from "fs-extra";
|
|
4
4
|
import path from "path";
|
|
5
|
-
import { iterLoadSenatDossiersLegislatifsRapportUrls, iterLoadSenatDossiersLegislatifsTexteUrls, RAPPORT_FOLDER, TEXTE_FOLDER,
|
|
5
|
+
import { iterLoadSenatDossiersLegislatifsRapportUrls, iterLoadSenatDossiersLegislatifsTexteUrls, RAPPORT_FOLDER, TEXTE_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, } from "../loaders";
|
|
6
6
|
import { parseExposeDesMotifs, parseTexte, parseTexteFromFile, } from "../model/texte";
|
|
7
|
-
import { UNDEFINED_SESSION } from "
|
|
7
|
+
import { getSessionsFromStart, UNDEFINED_SESSION } from "../types/sessions";
|
|
8
8
|
import { commonOptions } from "./shared/cli_helpers";
|
|
9
9
|
import { ensureAndClearDir, fetchWithRetry, isOptionEmptyOrHasValue, } from "./shared/util";
|
|
10
10
|
const optionsDefinitions = [
|
|
11
11
|
...commonOptions,
|
|
12
|
-
{
|
|
13
|
-
help: "sessions of textes to retrieve; leave empty for all",
|
|
14
|
-
multiple: true,
|
|
15
|
-
name: "sessions",
|
|
16
|
-
type: String,
|
|
17
|
-
},
|
|
18
12
|
{
|
|
19
13
|
help: "parse and convert documents into JSON (textes only for now, requires format xml)",
|
|
20
14
|
name: "parseDocuments",
|
|
@@ -63,18 +57,18 @@ async function retrieveDocument(documentUrl) {
|
|
|
63
57
|
return null;
|
|
64
58
|
}
|
|
65
59
|
}
|
|
66
|
-
async function retrieveTextes(dataDir) {
|
|
60
|
+
async function retrieveTextes(dataDir, sessions) {
|
|
67
61
|
const textesDir = path.join(dataDir, TEXTE_FOLDER);
|
|
68
62
|
fs.ensureDirSync(textesDir);
|
|
69
|
-
const originalTextesDir = path.join(textesDir,
|
|
70
|
-
const transformedTextesDir = path.join(textesDir,
|
|
63
|
+
const originalTextesDir = path.join(textesDir, DATA_ORIGINAL_FOLDER);
|
|
64
|
+
const transformedTextesDir = path.join(textesDir, DATA_TRANSFORMED_FOLDER);
|
|
71
65
|
if (options["parseDocuments"]) {
|
|
72
66
|
ensureAndClearDir(transformedTextesDir);
|
|
73
67
|
}
|
|
74
68
|
let retrievedTextesCount = 0;
|
|
75
69
|
const texteUrlsNotFoundOrError = [];
|
|
76
70
|
const texteUrlsParseError = [];
|
|
77
|
-
for (const session of
|
|
71
|
+
for (const session of sessions) {
|
|
78
72
|
for (const { item: texteMetadata, } of iterLoadSenatDossiersLegislatifsTexteUrls(dataDir, session)) {
|
|
79
73
|
const texteDir = path.join(originalTextesDir, `${texteMetadata.session ?? UNDEFINED_SESSION}`, texteMetadata.name);
|
|
80
74
|
fs.ensureDirSync(texteDir);
|
|
@@ -180,12 +174,12 @@ async function retrieveTextes(dataDir) {
|
|
|
180
174
|
}
|
|
181
175
|
}
|
|
182
176
|
}
|
|
183
|
-
async function retrieveRapports(dataDir) {
|
|
177
|
+
async function retrieveRapports(dataDir, sessions) {
|
|
184
178
|
const rapportsDir = path.join(dataDir, RAPPORT_FOLDER);
|
|
185
179
|
fs.ensureDirSync(rapportsDir);
|
|
186
180
|
let retrievedRapportsCount = 0;
|
|
187
181
|
const rapportUrlsNotFoundOrError = [];
|
|
188
|
-
for (const session of
|
|
182
|
+
for (const session of sessions) {
|
|
189
183
|
for (const { item: rapportMetadata, } of iterLoadSenatDossiersLegislatifsRapportUrls(dataDir, session)) {
|
|
190
184
|
const rapportDir = path.join(rapportsDir, `${rapportMetadata.session ?? UNDEFINED_SESSION}`, rapportMetadata.name);
|
|
191
185
|
fs.ensureDirSync(rapportDir);
|
|
@@ -231,12 +225,13 @@ async function retrieveRapports(dataDir) {
|
|
|
231
225
|
async function main() {
|
|
232
226
|
const dataDir = options["dataDir"];
|
|
233
227
|
assert(dataDir, "Missing argument: data directory");
|
|
228
|
+
const sessions = getSessionsFromStart(options["fromSession"]);
|
|
234
229
|
console.time("documents processing time");
|
|
235
230
|
if (isOptionEmptyOrHasValue(options["types"], "textes")) {
|
|
236
|
-
await retrieveTextes(dataDir);
|
|
231
|
+
await retrieveTextes(dataDir, sessions);
|
|
237
232
|
}
|
|
238
233
|
if (isOptionEmptyOrHasValue(options["types"], "rapports")) {
|
|
239
|
-
await retrieveRapports(dataDir);
|
|
234
|
+
await retrieveRapports(dataDir, sessions);
|
|
240
235
|
}
|
|
241
236
|
if (!options["silent"]) {
|
|
242
237
|
console.timeEnd("documents processing time");
|
|
@@ -10,7 +10,7 @@ import readline from "readline";
|
|
|
10
10
|
// import util from "util"
|
|
11
11
|
import windows1252 from "windows-1252";
|
|
12
12
|
import config from "../config";
|
|
13
|
-
import { datasets,
|
|
13
|
+
import { datasets, getChosenDatasets, getEnabledDatasets } from "../datasets";
|
|
14
14
|
import { commonOptions } from "./shared/cli_helpers";
|
|
15
15
|
const badWindows1252CharacterRegex = /[\u0080-\u009f]/g;
|
|
16
16
|
const optionsDefinitions = [
|
|
@@ -201,8 +201,9 @@ async function retrieveOpenData() {
|
|
|
201
201
|
encoding: "utf-8",
|
|
202
202
|
});
|
|
203
203
|
}
|
|
204
|
-
const
|
|
205
|
-
|
|
204
|
+
const enabledDatasets = getEnabledDatasets(options["categories"]);
|
|
205
|
+
const chosenDatasets = getChosenDatasets(enabledDatasets);
|
|
206
|
+
for (const dataset of chosenDatasets) {
|
|
206
207
|
await retrieveDataset(dataDir, dataset);
|
|
207
208
|
}
|
|
208
209
|
if (!options["silent"]) {
|
|
@@ -12,6 +12,12 @@ export declare const dataDirDefaultOption: {
|
|
|
12
12
|
name: string;
|
|
13
13
|
type: StringConstructor;
|
|
14
14
|
};
|
|
15
|
+
export declare const fromSessionOption: {
|
|
16
|
+
defaultValue: number;
|
|
17
|
+
help: string;
|
|
18
|
+
name: string;
|
|
19
|
+
type: NumberConstructor;
|
|
20
|
+
};
|
|
15
21
|
export declare const silentOption: {
|
|
16
22
|
alias: string;
|
|
17
23
|
help: string;
|
|
@@ -36,6 +42,11 @@ export declare const commonOptions: ({
|
|
|
36
42
|
help: string;
|
|
37
43
|
name: string;
|
|
38
44
|
type: StringConstructor;
|
|
45
|
+
} | {
|
|
46
|
+
defaultValue: number;
|
|
47
|
+
help: string;
|
|
48
|
+
name: string;
|
|
49
|
+
type: NumberConstructor;
|
|
39
50
|
} | {
|
|
40
51
|
alias: string;
|
|
41
52
|
help: string;
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
export const categoriesOption = {
|
|
2
2
|
alias: "k",
|
|
3
3
|
defaultValue: ["All"],
|
|
4
|
-
help: "categories of datasets to reorganize",
|
|
4
|
+
help: "categories of datasets to reorganize; default All",
|
|
5
5
|
multiple: true,
|
|
6
6
|
name: "categories",
|
|
7
7
|
type: String,
|
|
@@ -12,6 +12,12 @@ export const dataDirDefaultOption = {
|
|
|
12
12
|
name: "dataDir",
|
|
13
13
|
type: String,
|
|
14
14
|
};
|
|
15
|
+
export const fromSessionOption = {
|
|
16
|
+
defaultValue: 2023,
|
|
17
|
+
help: "session year to retrieve data from; default 2023",
|
|
18
|
+
name: "fromSession",
|
|
19
|
+
type: Number,
|
|
20
|
+
};
|
|
15
21
|
export const silentOption = {
|
|
16
22
|
alias: "s",
|
|
17
23
|
help: "don't log anything",
|
|
@@ -27,6 +33,7 @@ export const verboseOption = {
|
|
|
27
33
|
export const commonOptions = [
|
|
28
34
|
categoriesOption,
|
|
29
35
|
dataDirDefaultOption,
|
|
36
|
+
fromSessionOption,
|
|
30
37
|
silentOption,
|
|
31
38
|
verboseOption,
|
|
32
39
|
];
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
export interface AgendaEvent {
|
|
2
|
+
type: string | null;
|
|
3
|
+
heureDebut: string | null;
|
|
4
|
+
heureFin: string | null;
|
|
5
|
+
timeOriginal: string | null;
|
|
6
|
+
titre: string | null;
|
|
7
|
+
organe: string | null;
|
|
8
|
+
objet: string | null;
|
|
9
|
+
lieu: string | null;
|
|
10
|
+
url_dossier_senat: string | null;
|
|
11
|
+
url_video: string | null;
|
|
12
|
+
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
package/lib/types/sessions.d.ts
CHANGED
|
@@ -1,42 +1,5 @@
|
|
|
1
|
-
export declare
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
"2016-2017" = 2016,
|
|
7
|
-
"2017-2018" = 2017,
|
|
8
|
-
"2018-2019" = 2018,
|
|
9
|
-
"2019-2020" = 2019,
|
|
10
|
-
"2020-2021" = 2020,
|
|
11
|
-
"2021-2022" = 2021,
|
|
12
|
-
"2022-2023" = 2022,
|
|
13
|
-
"2023-2024" = 2023,
|
|
14
|
-
"2024-2025" = 2024,
|
|
15
|
-
"2025-2026" = 2025,
|
|
16
|
-
"2026-2027" = 2026,
|
|
17
|
-
"2027-2028" = 2027,
|
|
18
|
-
"2028-2029" = 2028,
|
|
19
|
-
"2029-2030" = 2029,
|
|
20
|
-
"2030-2031" = 2030,
|
|
21
|
-
"2031-2032" = 2031,
|
|
22
|
-
"2032-2033" = 2032,
|
|
23
|
-
"2033-2034" = 2033,
|
|
24
|
-
"2034-2035" = 2034,
|
|
25
|
-
"2035-2036" = 2035,
|
|
26
|
-
"2036-2037" = 2036,
|
|
27
|
-
"2037-2038" = 2037,
|
|
28
|
-
"2038-2039" = 2038,
|
|
29
|
-
"2039-2040" = 2039,
|
|
30
|
-
"2040-2041" = 2040,
|
|
31
|
-
"2041-2042" = 2041,
|
|
32
|
-
"2042-2043" = 2042,
|
|
33
|
-
"2043-2044" = 2043,
|
|
34
|
-
"2044-2045" = 2044,
|
|
35
|
-
"2045-2046" = 2045,
|
|
36
|
-
"2046-2047" = 2046,
|
|
37
|
-
"2047-2048" = 2047,
|
|
38
|
-
"2048-2049" = 2048,
|
|
39
|
-
"2049-2050" = 2049,
|
|
40
|
-
"2050-2051" = 2050,
|
|
41
|
-
"All" = 0
|
|
42
|
-
}
|
|
1
|
+
export declare const UNDEFINED_SESSION = 0;
|
|
2
|
+
declare const sessions: readonly [0, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025, 2026];
|
|
3
|
+
export type Session = typeof sessions[number];
|
|
4
|
+
export declare function getSessionsFromStart(startSession: Session): (0 | 1958 | 1959 | 1960 | 1961 | 1962 | 1963 | 1964 | 1965 | 1966 | 1967 | 1968 | 1969 | 1970 | 1971 | 1972 | 1973 | 1974 | 1975 | 1976 | 1977 | 1978 | 1979 | 1980 | 1981 | 1982 | 1983 | 1984 | 1985 | 1986 | 1987 | 1988 | 1989 | 1990 | 1991 | 1992 | 1993 | 1994 | 1995 | 1996 | 1997 | 1998 | 1999 | 2000 | 2001 | 2002 | 2003 | 2004 | 2005 | 2006 | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 | 2021 | 2022 | 2023 | 2024 | 2025 | 2026)[];
|
|
5
|
+
export {};
|
package/lib/types/sessions.js
CHANGED
|
@@ -1,43 +1,84 @@
|
|
|
1
|
-
export
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
1
|
+
/** Placeholder session value; it is also the first entry of the `sessions` list. */
export const UNDEFINED_SESSION = 0;
/**
 * Every known session, in ascending order: the placeholder first, then one
 * entry per year from 1958 onwards.
 */
const sessions = [
    UNDEFINED_SESSION,
    1958, 1959,
    1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969,
    1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979,
    1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989,
    1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
    2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019,
    2020, 2021, 2022, 2023, 2024, 2025, 2026,
    // The list is maintained by hand: append the new year every year.
];
|
|
75
|
+
/**
 * Lists the known sessions starting at `startSession`.
 *
 * @param startSession first session wanted; `0` selects the whole list
 * @returns a new array holding `startSession` and every later entry of
 *   `sessions`, or an empty array when `startSession` is not a known session
 */
export function getSessionsFromStart(startSession) {
    if (startSession === 0) {
        return [...sessions];
    }
    const firstIndex = sessions.indexOf(startSession);
    return firstIndex < 0 ? [] : sessions.slice(firstIndex);
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tricoteuses/senat",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "2.1.1",
|
|
4
4
|
"description": "Handle French Sénat's open data",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"France",
|
|
@@ -45,9 +45,10 @@
|
|
|
45
45
|
"data:convert_data": "tsx src/scripts/convert_data.ts",
|
|
46
46
|
"data:download": "bash -c 'npm run data:retrieve_open_data -- $@ && npm run data:convert_data -- $@' bash",
|
|
47
47
|
"data:generate_schemas": "tsx src/scripts/retrieve_open_data.ts --schema",
|
|
48
|
+
"data:retrieve_agenda": "tsx src/scripts/retrieve_agenda.ts",
|
|
49
|
+
"data:retrieve_documents": "tsx src/scripts/retrieve_documents.ts",
|
|
48
50
|
"data:retrieve_open_data": "tsx src/scripts/retrieve_open_data.ts --all",
|
|
49
51
|
"data:retrieve_senateurs_photos": "tsx src/scripts/retrieve_senateurs_photos.ts --fetch",
|
|
50
|
-
"data:retrieve_documents": "tsx src/scripts/retrieve_documents.ts",
|
|
51
52
|
"data:parse_textes_lois": "tsx src/scripts/parse_textes.ts",
|
|
52
53
|
"prepare": "npm run build",
|
|
53
54
|
"prepublishOnly": "npm run build",
|