@tricoteuses/senat 2.1.2 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -54,8 +54,8 @@ npm run data:retrieve_documents ../senat-data -- --fromSession 2023 --parseDocum
54
54
  # Parsing only
55
55
  npm run data:parse_textes_lois ../senat-data
56
56
 
57
- # Retrieval of agenda from Sénat's website
58
- npm run data:retrieve_agenda ../senat-data -- --fromSession 2023
57
+ # Retrieval (& parsing) of agenda from Sénat's website
58
+ npm run data:retrieve_agenda ../senat-data -- --fromSession 2023 [--parseAgenda]
59
59
 
60
60
  # Retrieval of sénateurs' pictures from Sénat's website
61
61
  npm run data:retrieve_senateurs_photos ../senat-data
package/lib/index.d.ts CHANGED
@@ -2,6 +2,7 @@ export type { AmendementResult, } from "./model/ameli";
2
2
  export type { DossierLegislatifResult, } from "./model/dosleg";
3
3
  export type { QuestionResult, } from "./model/questions";
4
4
  export type { CirconscriptionResult, OrganismeResult, SenateurResult, } from "./model/sens";
5
+ export type { AgendaEvent } from "./types/agenda";
5
6
  export type { Ses, Sub, TxtAmeli } from "./types/ameli";
6
7
  export type { Debat, LecAssDeb } from "./types/debats";
7
8
  export type { Ass, Aud, Auteur, DateSeance, DecCoc, DenRap, DocAtt, Ecr, EtaLoi, LecAss, LecAssRap, Lecture, Loi, Org, OriTxt, Qua, Rap, RapOrg, Scr, Texte, TypAtt, TypLec, TypLoi, TypTxt, TypUrl, } from "./types/dosleg";
package/lib/loaders.d.ts CHANGED
@@ -2,6 +2,7 @@ import { AmendementResult } from "./model/ameli";
2
2
  import { DossierLegislatifResult } from "./model/dosleg";
3
3
  import { QuestionResult } from "./model/questions";
4
4
  import { CirconscriptionResult, OrganismeResult, SenateurResult } from "./model/sens";
5
+ import { AgendaEvent } from "./types/agenda";
5
6
  import { FlatTexte } from "./types/texte";
6
7
  export { EnabledDatasets } from "./datasets";
7
8
  export declare const AGENDA_FOLDER = "agenda";
@@ -59,7 +60,8 @@ export declare function iterLoadSenatDossiersLegislatifsTexteUrls(dataDir: strin
59
60
  export declare function iterLoadSenatDossiersLegislatifsRapportUrls(dataDir: string, session: number | undefined): Generator<IterItem<RapportMetadata>>;
60
61
  export declare function iterLoadSenatDossiersLegislatifsTextes(dataDir: string, session: number | undefined, options?: {}): Generator<IterItem<DossierLegislatifTexteResult>>;
61
62
  export declare function loadSenatTexteContent(dataDir: string, textePathFromDataset: string): IterItem<FlatTexte | null>;
63
+ export declare function iterLoadSenatEvenements(dataDir: string, session: number | undefined, options?: {}): Generator<IterItem<AgendaEvent>>;
64
+ export declare function iterLoadSenatCirconscriptions(dataDir: string, options?: {}): Generator<IterItem<CirconscriptionResult>>;
62
65
  export declare function iterLoadSenatOrganismes(dataDir: string, options?: {}): Generator<IterItem<OrganismeResult>>;
63
66
  export declare function iterLoadSenatSenateurs(dataDir: string, legislature: number, options?: {}): Generator<IterItem<SenateurResult>>;
64
- export declare function iterLoadSenatCirconscriptions(dataDir: string, options?: {}): Generator<IterItem<CirconscriptionResult>>;
65
67
  export declare function iterLoadSenatQuestions(dataDir: string, legislature: number, options?: {}): Generator<IterItem<QuestionResult>>;
package/lib/loaders.js CHANGED
@@ -27,8 +27,8 @@ export function* iterFilePaths(dirPath) {
27
27
  }
28
28
  }
29
29
  }
30
- function* iterLoadSenatItems(dataDir, dataset, legislatureOrSession, subDir, { log = false } = {}) {
31
- let itemsDir = path.join(dataDir, dataset.database);
30
+ function* iterLoadSenatItems(dataDir, dataName, legislatureOrSession, subDir, { log = false } = {}) {
31
+ let itemsDir = path.join(dataDir, dataName);
32
32
  if (subDir) {
33
33
  itemsDir = path.join(itemsDir, subDir);
34
34
  }
@@ -41,7 +41,7 @@ function* iterLoadSenatItems(dataDir, dataset, legislatureOrSession, subDir, { l
41
41
  }
42
42
  const itemJson = fs.readFileSync(filePath, { encoding: "utf8" });
43
43
  const item = JSON.parse(itemJson);
44
- const filePathFromDataset = filePath.substring(filePath.indexOf(dataset.database) + dataset.database.length);
44
+ const filePathFromDataset = filePath.substring(filePath.indexOf(dataName) + dataName.length);
45
45
  yield {
46
46
  item,
47
47
  filePathFromDataset,
@@ -50,12 +50,12 @@ function* iterLoadSenatItems(dataDir, dataset, legislatureOrSession, subDir, { l
50
50
  }
51
51
  }
52
52
  export function* iterLoadSenatAmendements(dataDir, session, options = {}) {
53
- for (const amendementItem of iterLoadSenatItems(dataDir, datasets.ameli, session, undefined, options)) {
53
+ for (const amendementItem of iterLoadSenatItems(dataDir, datasets.ameli.database, session, undefined, options)) {
54
54
  yield amendementItem;
55
55
  }
56
56
  }
57
57
  export function* iterLoadSenatDossiersLegislatifs(dataDir, session, options = {}) {
58
- for (const dossierLegislatifItem of iterLoadSenatItems(dataDir, datasets.dosleg, session, DOSLEG_DOSSIERS_FOLDER, options)) {
58
+ for (const dossierLegislatifItem of iterLoadSenatItems(dataDir, datasets.dosleg.database, session, DOSLEG_DOSSIERS_FOLDER, options)) {
59
59
  yield dossierLegislatifItem;
60
60
  }
61
61
  }
@@ -129,15 +129,27 @@ export function loadSenatTexteContent(dataDir, textePathFromDataset) {
129
129
  const texteJson = fs.readFileSync(fullTextePath, { encoding: "utf8" });
130
130
  return { item: JSON.parse(texteJson) };
131
131
  }
132
+ export function* iterLoadSenatEvenements(dataDir, session, options = {}) {
133
+ for (const evenementsItem of iterLoadSenatItems(dataDir, AGENDA_FOLDER, session, DATA_TRANSFORMED_FOLDER, options)) {
134
+ for (const evenement of evenementsItem.item) {
135
+ yield { item: evenement };
136
+ }
137
+ }
138
+ }
139
+ export function* iterLoadSenatCirconscriptions(dataDir, options = {}) {
140
+ for (const circonscriptionItem of iterLoadSenatItems(dataDir, datasets.sens.database, undefined, SENS_CIRCONSCRIPTIONS_FOLDER, options)) {
141
+ yield circonscriptionItem;
142
+ }
143
+ }
132
144
  export function* iterLoadSenatOrganismes(dataDir, options = {}) {
133
- for (const organismeItem of iterLoadSenatItems(dataDir, datasets.sens, undefined, SENS_ORGANISMES_FOLDER, options)) {
145
+ for (const organismeItem of iterLoadSenatItems(dataDir, datasets.sens.database, undefined, SENS_ORGANISMES_FOLDER, options)) {
134
146
  yield organismeItem;
135
147
  }
136
148
  }
137
149
  export function* iterLoadSenatSenateurs(dataDir, legislature, options = {}) {
138
150
  const dateDebutLegislatureStr = legislatures.find((legislatureInfo) => legislatureInfo.numero === legislature)?.date_debut;
139
151
  const dateDebutLegislature = new Date(dateDebutLegislatureStr);
140
- for (const senateurItem of iterLoadSenatItems(dataDir, datasets.sens, undefined, SENS_SENATEURS_FOLDER, options)) {
152
+ for (const senateurItem of iterLoadSenatItems(dataDir, datasets.sens.database, undefined, SENS_SENATEURS_FOLDER, options)) {
141
153
  const dateFinMandatSenateur = senateurItem.item.mandats_senateur[0]
142
154
  ?.date_fin
143
155
  ? new Date(senateurItem.item.mandats_senateur[0]?.date_fin)
@@ -148,13 +160,8 @@ export function* iterLoadSenatSenateurs(dataDir, legislature, options = {}) {
148
160
  yield senateurItem;
149
161
  }
150
162
  }
151
- export function* iterLoadSenatCirconscriptions(dataDir, options = {}) {
152
- for (const circonscriptionItem of iterLoadSenatItems(dataDir, datasets.sens, undefined, SENS_CIRCONSCRIPTIONS_FOLDER, options)) {
153
- yield circonscriptionItem;
154
- }
155
- }
156
163
  export function* iterLoadSenatQuestions(dataDir, legislature, options = {}) {
157
- for (const questionItem of iterLoadSenatItems(dataDir, datasets.questions, legislature, undefined, options)) {
164
+ for (const questionItem of iterLoadSenatItems(dataDir, datasets.questions.database, legislature, undefined, options)) {
158
165
  yield questionItem;
159
166
  }
160
167
  }
@@ -1,5 +1,6 @@
1
1
  import { JSDOM } from "jsdom";
2
2
  import { DateTime } from "luxon";
3
+ import path from "path";
3
4
  function getEventType(eventClasses) {
4
5
  const typeClass = [...eventClasses]
5
6
  .find(className => className.startsWith("evt-"))
@@ -23,52 +24,68 @@ function getUrlDossierSenat(lienElements) {
23
24
  .find(lienElement => lienElement.textContent?.includes("dossier législatif"));
24
25
  return urlElement ? urlElement.getAttribute("href") : null;
25
26
  }
26
- function getHeuresDebutFin(timeStr) {
27
- const normalizedHeureDebut = timeStr
28
- ?.replace(/^À l'issue de l'espace réservé .* et au plus tard\s/i, "") // Must be first
29
- ?.replace(/^(?:le )?matin/i, "10h00")
30
- ?.replace(/^(?:l')?après-midi/i, "16h00")
31
- ?.replace(/^(?:le )?soir/i, "20h00")
32
- ?.replace(/^(?:la )?nuit/i, "22h00")
27
+ /**
28
+ * Normalize time string to become a simple start time ("H'h'mm") or a duration ("'de 'H'h'mm' à 'H'h'mm").
29
+ */
30
+ function normalizeTime(timeStr) {
31
+ return timeStr
32
+ ?.replace(/^À l'issue de l'espace réservé .* et au plus tard\s/i, "") // Must be processed first
33
+ ?.replace(/^(?:le )?matin/i, "10h00") // We chose "matin" to mean 10h00
34
+ ?.replace(/^(?:l')?après-midi/i, "16h00") // We chose "après-midi" to mean 16h00
35
+ ?.replace(/^(?:le )?soir/i, "20h00") // We chose "soir" to mean 20h00
36
+ ?.replace(/^(?:la )?nuit/i, "22h00") // We chose "nuit" to mean 22h00
33
37
  ?.replace(/^à\s/ig, "")
34
38
  ?.replace(/heures/ig, "h00")
35
39
  ?.replace(/\set.*/i, "")
36
40
  ?.replace(/,.*/, "")
37
41
  ?.replace(/\s\(hors hémicycle\)/i, "")
38
42
  ?.replace(/\s*h\s*/ig, "h");
39
- console.dir(`${timeStr};${normalizedHeureDebut}`);
40
- const heureDebut = timeStr
41
- ? DateTime.fromFormat(timeStr, "H'h'mm").toISOTime()
42
- : null;
43
+ }
44
+ function getStartAndEndTimes(timeStr) {
45
+ const normalizedTime = normalizeTime(timeStr);
46
+ const timeMatches = normalizedTime
47
+ ?.match(/^de (?<startTime>\d{2}h\d{2}) à (?<endTime>\d{2}h\d{2})$/i);
48
+ if (timeMatches?.groups) {
49
+ const { startTime, endTime } = timeMatches.groups;
50
+ return {
51
+ startTime: startTime ? DateTime.fromFormat(startTime, "H'h'mm").toISOTime() : null,
52
+ endTime: endTime ? DateTime.fromFormat(endTime, "H'h'mm").toISOTime() : null,
53
+ };
54
+ }
43
55
  return {
44
- heureDebut,
45
- heureFin: null
56
+ startTime: normalizedTime ? DateTime.fromFormat(normalizedTime, "H'h'mm").toISOTime() : null,
57
+ endTime: null,
46
58
  };
47
59
  }
48
- function transformAgenda(document) {
60
+ function transformAgenda(document, fileName) {
49
61
  const agendaEvents = [];
50
62
  const eventElements = document.querySelectorAll(".evt");
51
63
  for (const eventElement of eventElements) {
64
+ const id = eventElement.previousElementSibling?.getAttribute("name") || null;
65
+ if (!id) {
66
+ continue;
67
+ }
52
68
  const type = getEventType(eventElement.classList);
69
+ const date = DateTime.fromFormat(fileName, "yyyyMMdd").toFormat("yyyy-MM-dd");
53
70
  const timeOriginal = eventElement.querySelector(".time")?.textContent || null;
54
- const { heureDebut, heureFin } = getHeuresDebutFin(timeOriginal);
55
- const titre = eventElement.querySelector(".titre")?.textContent || null;
56
- const organe = eventElement.querySelector(".organe")?.textContent || null;
57
- const objet = eventElement.querySelector(".objet")?.textContent || null;
71
+ const { startTime, endTime } = getStartAndEndTimes(timeOriginal);
72
+ const titre = eventElement.querySelector(".titre")?.textContent?.trim() || null;
73
+ const organe = eventElement.querySelector(".organe")?.textContent?.trim() || null;
74
+ const objet = eventElement.querySelector(".objet")?.textContent?.trim() || null;
58
75
  const lieu = eventElement.querySelector(".lieu")?.textContent || null;
59
76
  const url_dossier_senat = getUrlDossierSenat(eventElement.querySelectorAll(".lien a"));
60
- const url_video = eventElement.querySelector(".video a")?.getAttribute("href") || null;
61
77
  agendaEvents.push({
78
+ id,
62
79
  type,
63
- heureDebut,
64
- heureFin,
80
+ date,
81
+ startTime,
82
+ endTime,
65
83
  timeOriginal,
66
84
  titre,
67
85
  organe,
68
86
  objet,
69
87
  lieu,
70
88
  url_dossier_senat,
71
- url_video,
72
89
  });
73
90
  }
74
91
  return agendaEvents;
@@ -76,7 +93,8 @@ function transformAgenda(document) {
76
93
  export async function parseAgendaFromFile(htmlFilePath) {
77
94
  try {
78
95
  const { document } = (await JSDOM.fromFile(htmlFilePath, { contentType: "text/html" })).window;
79
- return transformAgenda(document);
96
+ const fileName = path.parse(htmlFilePath).name;
97
+ return transformAgenda(document, fileName);
80
98
  }
81
99
  catch (error) {
82
100
  console.error(`Could not parse texte with error ${error}`);
@@ -3,35 +3,46 @@ import commandLineArgs from "command-line-args";
3
3
  import fs from "fs-extra";
4
4
  import { DateTime } from "luxon";
5
5
  import path from "path";
6
- import { AGENDA_FOLDER, DATA_ORIGINAL_FOLDER } from "../loaders";
6
+ import { AGENDA_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER } from "../loaders";
7
+ import { parseAgendaFromFile } from "../model/agenda";
7
8
  import { getSessionsFromStart } from "../types/sessions";
8
9
  import { commonOptions } from "./shared/cli_helpers";
9
10
  import { ensureAndClearDir } from "./shared/util";
10
11
  const optionsDefinitions = [
11
12
  ...commonOptions,
12
13
  {
13
- help: "parse and convert documents into JSON (textes only for now, requires format xml)",
14
- name: "parseDocuments",
14
+ help: "parse and convert agenda events into JSON",
15
+ name: "parseAgenda",
15
16
  type: Boolean,
16
17
  },
17
18
  ];
18
19
  const options = commandLineArgs(optionsDefinitions);
19
20
  const SENAT_GLOBAL_AGENDA_URL_ROOT = "https://www.senat.fr/aglae/Global";
20
- async function retrieveAgenda(dataDir, sessions) {
21
+ async function retrieveAgendas(dataDir, sessions) {
21
22
  const agendaRootDir = path.join(dataDir, AGENDA_FOLDER);
22
23
  ensureAndClearDir(agendaRootDir);
23
24
  const originalAgendaDir = path.join(agendaRootDir, DATA_ORIGINAL_FOLDER);
24
25
  fs.ensureDirSync(originalAgendaDir);
26
+ const transformedAgendaDir = path.join(agendaRootDir, DATA_TRANSFORMED_FOLDER);
27
+ if (options["parseAgenda"]) {
28
+ fs.ensureDirSync(transformedAgendaDir);
29
+ }
25
30
  for (const session of sessions) {
26
31
  if (!options["silent"]) {
27
32
  console.log(`Retrieving Agenda for session ${session}…`);
28
33
  }
29
- const agendaSessionDir = path.join(originalAgendaDir, `${session}`);
30
- fs.ensureDirSync(agendaSessionDir);
34
+ const originalAgendaSessionDir = path.join(originalAgendaDir, `${session}`);
35
+ fs.ensureDirSync(originalAgendaSessionDir);
36
+ const transformedAgendaSessionDir = path.join(transformedAgendaDir, `${session}`);
37
+ if (options["parseAgenda"]) {
38
+ fs.ensureDirSync(transformedAgendaSessionDir);
39
+ }
31
40
  const fifteenDaysFromNow = new Date();
32
41
  fifteenDaysFromNow.setDate(fifteenDaysFromNow.getDate() + 15);
33
42
  for (const date = new Date(session, 0, 1); date <= new Date(session, 11, 31) && date <= fifteenDaysFromNow; date.setDate(date.getDate() + 1)) {
34
43
  const agendaName = DateTime.fromJSDate(date).toFormat("ddMMyyyy");
44
+ const agendaFileName = DateTime.fromJSDate(date).toFormat("yyyyMMdd");
45
+ const agendaPath = path.join(originalAgendaSessionDir, agendaFileName);
35
46
  try {
36
47
  const response = await fetch(`${SENAT_GLOBAL_AGENDA_URL_ROOT}/agl${agendaName}.html`);
37
48
  if (!response.ok) {
@@ -47,11 +58,18 @@ async function retrieveAgenda(dataDir, sessions) {
47
58
  if (!agendaContent) {
48
59
  return;
49
60
  }
50
- fs.writeFileSync(path.join(agendaSessionDir, agendaName), Buffer.from(agendaContent));
61
+ fs.writeFileSync(agendaPath, Buffer.from(agendaContent));
51
62
  }
52
63
  catch (error) {
53
64
  console.error(error.message);
54
65
  }
66
+ if (options["parseAgenda"]) {
67
+ const parsedAgendaEvents = await parseAgendaFromFile(agendaPath);
68
+ if (!parsedAgendaEvents || parsedAgendaEvents.length === 0) {
69
+ continue;
70
+ }
71
+ fs.writeJSONSync(path.join(transformedAgendaSessionDir, `${agendaFileName}.json`), parsedAgendaEvents, { spaces: 2 });
72
+ }
55
73
  }
56
74
  }
57
75
  }
@@ -60,7 +78,7 @@ async function main() {
60
78
  assert(dataDir, "Missing argument: data directory");
61
79
  const sessions = getSessionsFromStart(options["fromSession"]);
62
80
  console.time("agenda processing time");
63
- await retrieveAgenda(dataDir, sessions);
81
+ await retrieveAgendas(dataDir, sessions);
64
82
  if (!options["silent"]) {
65
83
  console.timeEnd("agenda processing time");
66
84
  }
@@ -1,12 +1,13 @@
1
1
  export interface AgendaEvent {
2
+ id: string;
2
3
  type: string | null;
3
- heureDebut: string | null;
4
- heureFin: string | null;
4
+ date: string | null;
5
+ startTime: string | null;
6
+ endTime: string | null;
5
7
  timeOriginal: string | null;
6
8
  titre: string | null;
7
9
  organe: string | null;
8
10
  objet: string | null;
9
11
  lieu: string | null;
10
12
  url_dossier_senat: string | null;
11
- url_video: string | null;
12
13
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tricoteuses/senat",
3
- "version": "2.1.2",
3
+ "version": "2.2.1",
4
4
  "description": "Handle French Sénat's open data",
5
5
  "keywords": [
6
6
  "France",