@transportme/vline-nsp-reader 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/constants.mjs +4 -0
- package/lib/nsp-pdf-reader.mjs +153 -0
- package/lib/nsp-utils.mjs +33 -0
- package/lib/table-reader.mjs +98 -0
- package/lib/vline-nsp.mjs +96 -0
- package/lib.mjs +7 -0
- package/package.json +23 -0
- package/read-pdf.mjs +12 -0
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
import { getOperationDays } from './nsp-utils.mjs'
|
|
2
|
+
import TableReader from './table-reader.mjs'
|
|
3
|
+
|
|
4
|
+
export default class NSPPDFReader {

  #filePath
  #pages

  // Per-page cache for getStations(), indexed by page number.
  #stations = []

  /**
   * Reads a V/Line NSP (Network Service Plan) working timetable PDF and
   * extracts the train runs it describes.
   *
   * @param {string} filePath Path to the NSP PDF to read
   */
  constructor(filePath) {
    this.#filePath = filePath
  }

  /**
   * Index of the last header row on a page: the row whose first cell is
   * 'Train Movement Type' marks the end of the per-run header section.
   */
  getHeaderIndex(pageNum) {
    return this.#pages[pageNum].findIndex(row => row[0] === 'Train Movement Type')
  }

  /** Header rows (TDN, days run, operator, movement type, ...) of a page. */
  getHeader(pageNum) {
    let headerIndex = this.getHeaderIndex(pageNum)
    return this.#pages[pageNum].slice(0, headerIndex + 1)
  }

  /**
   * Station timing rows of a page. Skips one extra row beyond the header —
   * the row directly under 'Train Movement Type' is presumably not a
   * station row in the source PDFs (TODO confirm against a sample file).
   */
  getBody(pageNum) {
    let headerIndex = this.getHeaderIndex(pageNum)
    return this.#pages[pageNum].slice(headerIndex + 2)
  }

  /**
   * Station name for every body row of a page. Rows with a blank first cell
   * (e.g. platform/track amendment rows) inherit the name of the closest
   * named row above them. Results are cached per page.
   */
  getStations(pageNum) {
    if (this.#stations[pageNum]) return this.#stations[pageNum]
    let body = this.getBody(pageNum)
    let stationList = body.map(row => row[0])

    let output = []
    let lastStation = stationList[0]
    for (let i = 0; i < stationList.length; i++) {
      if (stationList[i]) {
        lastStation = stationList[i]
        output.push(stationList[i])
      } else {
        output.push(lastStation)
      }
    }

    this.#stations[pageNum] = output
    return output
  }

  /**
   * Parses the PDF into per-page tables. Pages without a 'Business ID' row
   * near the top are treated as continuations of the previous page and are
   * appended to it, padding their columns if the continuation is narrower.
   */
  async read() {
    let tableReader = new TableReader(this.#filePath)
    let rawPages = await tableReader.read()

    let pages = []
    for (let page of rawPages) {
      let header = page.slice(0, 4)
      if (header.some(row => row[0] === 'Business ID')) pages.push(page)
      else {
        // Continuation page: match its column count to the page it extends.
        let colDiff = pages[pages.length - 1][0].length - page[0].length
        if (colDiff > 0) for (let row of page) row.push(...Array(colDiff).fill(''))
        pages[pages.length - 1].push(...page)
      }
    }

    this.#pages = pages
  }

  // Test hook: inject page data without reading a PDF.
  __setPageData(data) { this.#pages = data }

  /**
   * Normalises a single station entry in place:
   * - a trailing '*' on the departure time marks the stop as express
   * - 'arr/dep' departure cells are split into both times
   * - stations with only a departure time arrive and depart at that time
   */
  processStation(station) {
    if (station.depTime.endsWith('*')) {
      station.express = true
      station.depTime = station.depTime.slice(0, -1)
    }

    if (station.depTime && !station.arrTime) {
      if (station.depTime.includes('/')) {
        let [ arrTime, depTime ] = station.depTime.split('/')
        station.arrTime = arrTime
        station.depTime = depTime
      } else {
        station.arrTime = station.depTime
      }
    }

    return station
  }

  /**
   * Extracts every run on a page (one run per column from index 2 onwards).
   * Returns objects carrying the run's TDN, operating days, operator,
   * movement type, vehicle type, forming data and the stations it serves.
   */
  getRuns(pageNum) {
    let header = this.getHeader(pageNum)
    let body = this.getBody(pageNum)
    let stations = this.getStations(pageNum)

    let runs = []
    for (let runIndex = 2; runIndex < header[0].length; runIndex++) {
      let runData = {
        tdn: header[0][runIndex].slice(0, 4),
        // TDN cells longer than 4 characters carry an extra marker,
        // treated here as a conditional run.
        conditional: header[0][runIndex].length > 4,
        daysRunCode: header[1][runIndex],
        daysRun: getOperationDays(header[1][runIndex]),
        operator: null, movementType: null,
        formedBy: null, forming: null,
        vehicleType: null, stations: []
      }

      for (let headerRow = 2; headerRow < header.length; headerRow++) {
        let rowName = header[headerRow][0]
        let rowData = header[headerRow][runIndex]
        if (rowName === 'Operator') runData.operator = rowData
        else if (rowName === 'Train Movement Type') runData.movementType = rowData
        else if (rowName === 'Master Vehicle Formation') runData.vehicleType = rowData
        else if (rowName === 'Formed By On Arrival') runData.formedBy = rowData
      }

      let lastStation = null
      let lastStationName = null
      // The final body row holds the 'forming' TDN, so stop one row short.
      for (let stationIndex = 0; stationIndex < stations.length - 1; stationIndex++) {
        let currentStation = stations[stationIndex]
        let stationData = body[stationIndex][runIndex]
        if (!stationData) continue
        if (lastStationName === currentStation) {
          // Repeated station name = amendment row: platform, arrival or track.
          let amendmentType = body[stationIndex][1]
          if (amendmentType === 'Plat') lastStation.plat = stationData
          else if (amendmentType === 'Arr') lastStation.arrTime = stationData
          else lastStation.track = stationData
        } else {
          if (lastStation) runData.stations.push(lastStation)
          lastStation = {
            name: currentStation,
            arrTime: null,
            depTime: stationData,
            plat: null, track: null, express: false
          }
        }

        lastStationName = currentStation
      }

      // Guard: a run column with no timing data at all leaves lastStation
      // null; pushing it unconditionally would crash in processStation.
      if (lastStation) runData.stations.push(lastStation)
      runData.stations = runData.stations.map(station => this.processStation(station))
      // The forming cell may be empty for the last run on a page.
      let forming = (body[stations.length - 1][runIndex] || '').toUpperCase()
      runData.forming = forming.length ? forming : null
      runs.push(runData)
    }

    return runs
  }

  /** Runs from every page of the PDF, in page order. */
  getAllRuns() {
    let runs = []
    for (let i = 0; i < this.#pages.length; i++) runs.push(...this.getRuns(i))
    return runs
  }

}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
// Days of the week in NSP order (week starts Monday); slice(0, -2) of this
// list yields the Mon-Fri weekdays used by the 'MF' and '...E' codes below.
const daysOfWeek = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

// Maps the one/two-letter day codes found in NSP operation-day strings
// (e.g. 'MWFO', 'TuThE') to three-letter day names. Both 'Sun' and 'Su'
// are mapped, presumably because both spellings appear in the source PDFs.
const dayMapping = {
  'M': 'Mon',
  'Tu': 'Tue',
  'W': 'Wed',
  'Th': 'Thu',
  'F': 'Fri',
  'Sat': 'Sat',
  'Sun': 'Sun',
  'Su': 'Sun'
}
|
|
12
|
+
|
|
13
|
+
/**
 * Splits a compact NSP day code such as 'MWF' or 'TuTh' into its capitalised
 * letter groups, e.g. ['M', 'W', 'F'] or ['Tu', 'Th'].
 *
 * @param {string} letters Day code with any trailing O/E already stripped
 * @returns {string[]} Letter groups; empty when the input contains none
 *   (String.match returns null in that case, which previously crashed the
 *   callers that immediately .map the result).
 */
export function getOperationDayLetter(letters) {
  return letters.match(/([A-Z][a-z]*)/g) ?? []
}
|
|
16
|
+
|
|
17
|
+
/**
 * Expands an NSP days-of-operation code into a list of day names.
 * Understands 'Only' codes ('MWFO'), 'Except' codes ('TuE' = weekdays minus
 * Tuesday), 'MF' (all weekdays), 'Daily', plain day names, and '+'-joined
 * unions of any of these. Unrecognised codes yield an empty list.
 *
 * @param {string} days The raw days-run code from the timetable header
 * @returns {string[]} Three-letter day names, e.g. ['Mon', 'Wed', 'Fri']
 */
export function getOperationDays(days) {
  let parts = days.split('+')

  // Composite codes like 'MF+Sat' are the union of their parts.
  if (parts.length > 1) {
    let combined = []
    for (let part of parts) combined = combined.concat(getOperationDays(part))
    return combined
  }

  if (days.endsWith('O')) {
    // 'Only' code: the listed days themselves.
    return getOperationDayLetter(days.slice(0, -1)).map(code => dayMapping[code])
  }

  if (days.endsWith('E')) {
    // 'Except' code: every weekday except the listed days.
    let excluded = getOperationDayLetter(days.slice(0, -1)).map(code => dayMapping[code])
    return daysOfWeek.slice(0, -2).filter(day => !excluded.includes(day))
  }

  if (days === 'MF') return daysOfWeek.slice(0, -2)
  if (days === 'Daily') return daysOfWeek.slice(0)
  if (daysOfWeek.includes(days)) return [ days ]

  return []
}
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import PDFParser from 'pdf2json'
|
|
2
|
+
import fs from 'fs/promises'
|
|
3
|
+
|
|
4
|
+
export default class TableReader {

  #file

  /**
   * Extracts tabular data from a PDF using pdf2json's page geometry.
   *
   * @param {string} file Path to the PDF to read
   */
  constructor(file) {
    this.#file = file
  }

  /**
   * Converts pdf2json output into a string grid per page.
   *
   * Each page contains Texts positioned by x/y and Fills (drawn rectangles).
   * The table grid is recovered from the Fills: thin vertical rectangles
   * mark column boundaries, thin horizontal ones mark row boundaries. Each
   * Text is then bucketed into the grid cell its coordinates fall in; texts
   * landing in the same cell are joined with a space. Small y jitters
   * (~0.01 units) are absorbed by the 0.3/0.6 tolerances below.
   *
   * @param {{Pages: Array}} data Parsed pdf2json document
   * @returns {string[][][]} One 2-D array of trimmed cell strings per page
   */
  parserCallback(data) {
    let pages = data.Pages.map(page => {
      let fills = page.Fills

      // Vertical rules (taller than wide): their x positions are the column starts.
      let colStarts = fills.filter(fill => {
        return fill.h > fill.w
      }).map(fill => fill.x).filter((e, i, a) => a.indexOf(e) === i).sort((a, b) => a - b)

      // Horizontal rules (wider than tall): their y positions are the row starts.
      let rowStarts = fills.filter(fill => {
        return fill.h < fill.w
      }).map(fill => fill.y).filter((e, i, a) => a.indexOf(e) === i).sort((a, b) => a - b)

      let pageData = []
      let smallTable = rowStarts.length === 2

      page.Texts.forEach(text => {
        let textContent = decodeURIComponent(text.R[0].T)
        // Skip page furniture (effective/replacement dates) outside the table.
        if (textContent.match(/^(Effective|As at|Replaces)/)) return

        // A text belongs to the row band just above the first horizontal
        // rule below it. If it sits well above that rule (difference > 0.6)
        // it is shifted up one band — presumably compensating for tall
        // multi-line cells; TODO confirm against a sample PDF.
        let firstYGreater = rowStarts.find(r => r > text.y + 0.3)
        let difference = firstYGreater - text.y
        let currentRow = rowStarts.indexOf(firstYGreater) - 1
        if (!smallTable && difference > 0.6) currentRow--
        if (currentRow < 0) return

        let currentCol = colStarts.findIndex(c => c > text.x + 0.4) - 1

        if (!pageData[currentRow]) pageData[currentRow] = []

        if (!pageData[currentRow][currentCol]) pageData[currentRow][currentCol] = textContent
        else pageData[currentRow][currentCol] += ` ${textContent}`
      })

      // Densify: missing rows become empty arrays, missing cells empty strings.
      for (let y = 0; y < pageData.length; y++) {
        if (!pageData[y]) pageData[y] = []
        for (let x = 0; x < pageData[y].length; x++) {
          if (!pageData[y][x]) pageData[y][x] = ''
        }
      }
      return pageData
    })

    // Pad every row to the page's widest row and normalise whitespace.
    return pages.map(page => {
      // Guard the empty-page case: Math.max() with no values is -Infinity,
      // which would make Array(maxSize) throw.
      let maxSize = page.length ? Math.max(...page.map(row => row.length)) : 0
      let blankCells = Array(maxSize).fill('')

      return page.map(row => row.map(g => g.replace(/ +/g, ' ').trim()).concat(blankCells).slice(0, maxSize))
    })
  }

  /**
   * Reads and parses the PDF.
   *
   * The file is read before the promise wrapping the parser's callback API,
   * so I/O failures reject this method's returned promise directly instead
   * of passing through an async promise executor.
   *
   * @returns {Promise<string[][][]>} Per-page cell grids (see parserCallback)
   */
  async read() {
    let pdfBuffer = await fs.readFile(this.#file)

    return new Promise((resolve, reject) => {
      let pdfParser = new PDFParser()

      pdfParser.on("pdfParser_dataReady", data => {
        try {
          resolve(this.parserCallback(data))
        } catch (err) {
          reject(err)
        }
      })

      pdfParser.on("pdfParser_dataError", err => {
        reject(err)
      })

      pdfParser.parseBuffer(pdfBuffer)
    })
  }

}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import fetch from 'node-fetch';
|
|
2
|
+
import constants from './constants.mjs'
|
|
3
|
+
import { load as parseHTML } from 'cheerio'
|
|
4
|
+
import async from 'async'
|
|
5
|
+
import fs from 'fs/promises'
|
|
6
|
+
import path from 'path'
|
|
7
|
+
import { pipeline } from 'stream/promises'
|
|
8
|
+
import { createWriteStream } from 'fs'
|
|
9
|
+
import NSPPDFReader from './nsp-pdf-reader.mjs'
|
|
10
|
+
|
|
11
|
+
export class NSPVersion {

  version
  effective

  // NSPFile instances belonging to this version, in discovery order.
  files = []

  /**
   * One published NSP version and the timetable files it consists of.
   *
   * @param {string} version Version code, e.g. 'FP25'
   * @param {Date} effective Date the timetable takes effect
   */
  constructor(version, effective) {
    this.version = version
    this.effective = effective
  }

  /** Registers a file as part of this NSP version. */
  addFile(file) {
    this.files.push(file)
  }

  /**
   * Downloads every file of this version into outputDir, creating the
   * directory if needed. Files are downloaded concurrently.
   *
   * @param {string} outputDir Directory the PDFs are written into
   */
  async saveFiles(outputDir) {
    // recursive: true makes mkdir a no-op when the directory already exists,
    // so real failures (e.g. permissions) surface instead of being
    // swallowed by a bare catch.
    await fs.mkdir(outputDir, { recursive: true })
    await async.forEach(this.files, async file => {
      await file.download(outputDir)
    })
  }

}
|
|
37
|
+
|
|
38
|
+
export class NSPFile {

  name
  href

  #nspVersion
  #filePath

  /**
   * A single timetable PDF belonging to an NSP version.
   *
   * @param {string} name Human-readable file name, e.g. a corridor name
   * @param {string} href Path of the PDF on the V/Line corporate site
   * @param {NSPVersion} nspVersion The version this file belongs to
   */
  constructor(name, href, nspVersion) {
    this.name = name
    this.href = href

    this.#nspVersion = nspVersion
  }

  /**
   * Downloads this PDF into outputDir and remembers the resulting path for
   * extractRuns().
   *
   * @param {string} outputDir Directory the PDF is written into
   * @throws {Error} When the server responds with a non-2xx status
   */
  async download(outputDir) {
    this.#filePath = path.join(outputDir, `${this.#nspVersion.version} ${this.name}.pdf`)

    let response = await fetch(constants.VLINE_CORPORATE_HOST + this.href)
    // Fail loudly rather than streaming an HTML error page into the .pdf.
    if (!response.ok) throw new Error(`Failed to download ${this.href}: HTTP ${response.status}`)

    let outputStream = createWriteStream(this.#filePath)

    await pipeline(response.body, outputStream)
  }

  /** Points this file at an already-downloaded PDF instead of fetching one. */
  setFilePath(filePath) {
    this.#filePath = filePath
  }

  /**
   * Parses the downloaded PDF and returns every run it contains.
   * Requires download() or setFilePath() to have been called first.
   */
  async extractRuns() {
    let reader = new NSPPDFReader(this.#filePath)
    await reader.read()
    return reader.getAllRuns()
  }

}
|
|
73
|
+
|
|
74
|
+
/**
 * Scrapes the V/Line NSP publications page and groups the listed timetable
 * PDFs into NSPVersion objects.
 *
 * @returns {Promise<NSPVersion[]>} All versions found, newest effective first
 */
export async function getNSPVersion() {
  let body = await (await fetch(constants.VLINE_CORPORATE_HOST + constants.NSP_PAGE)).text()
  let $ = parseHTML(body)

  let buttons = Array.from($('div#publication-list > a.btn.button-file-link-caption'))

  let nspVersions = {}

  // Only 'FP…' captions are NSP timetables: 'FPxx d-m-yy <name> NSP ...'
  buttons.filter(button => $(button).text().trim().startsWith('FP')).forEach(button => {
    let text = $(button).text()
    let data = text.match(/(FP\w+) (\d+)-(\d+)-(\d+) (.+)/)
    if (!data) return

    let [_, version, day, month, year, name] = data

    if (!nspVersions[version]) {
      // Build a strict ISO yyyy-mm-dd string: 2-digit years become 20xx and
      // day/month are zero-padded, so Date parsing is unambiguous instead of
      // relying on implementation-defined handling of 'yyyy-m-d'.
      let isoDate = `${`20${year}`.slice(-4)}-${month.padStart(2, '0')}-${day.padStart(2, '0')}`
      nspVersions[version] = new NSPVersion(version, new Date(isoDate))
    }

    let nspVersion = nspVersions[version]
    nspVersion.addFile(new NSPFile(name.replace(/ NSP.+/, '').trim(), $(button).attr('href'), nspVersion))
  })

  return Object.values(nspVersions).sort((a, b) => b.effective - a.effective)
}
|
package/lib.mjs
ADDED
package/package.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@transportme/vline-nsp-reader",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"main": "index.js",
|
|
5
|
+
"scripts": {
|
|
6
|
+
"test": "mocha"
|
|
7
|
+
},
|
|
8
|
+
"author": "",
|
|
9
|
+
"license": "ISC",
|
|
10
|
+
"description": "",
|
|
11
|
+
"dependencies": {
|
|
12
|
+
"async": "^3.2.6",
|
|
13
|
+
"cheerio": "^1.0.0",
|
|
14
|
+
"node-fetch": "^3.3.2",
|
|
15
|
+
"pdf2json": "github:eyeballcode/pdf2json"
|
|
16
|
+
},
|
|
17
|
+
"devDependencies": {
|
|
18
|
+
"chai": "^5.1.2",
|
|
19
|
+
"mocha": "^10.8.2",
|
|
20
|
+
"nock": "^13.5.6",
|
|
21
|
+
"tmp-promise": "^3.0.3"
|
|
22
|
+
}
|
|
23
|
+
}
|
package/read-pdf.mjs
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
// Debug CLI: node read-pdf.mjs <path-to-nsp.pdf>
// Prints every table page extracted from the given PDF to the console.
import util from 'util'
import NSPPDFReader from './lib/nsp-pdf-reader.mjs'
import TableReader from './lib/table-reader.mjs'

// Alternate mode: dump fully-parsed runs instead of raw table pages.
// let nspReader = new NSPPDFReader(process.argv[2])
// await nspReader.read()
// console.log(util.inspect(nspReader.getAllRuns(), { depth: null, colors: true, maxArrayLength: null }))

let tableReader = new TableReader(process.argv[2])
let pages = await tableReader.read()

for (let page of pages) console.table(page)
|