pdf2sheets-helper 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +143 -0
- package/index.js +185 -0
- package/package.json +28 -0
package/README.md
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
# pdf2sheets-helper
|
|
2
|
+
|
|
3
|
+
Utility helpers for working with PDF table data in spreadsheet workflows. Clean, detect headers, normalize numbers, and export to CSV -- everything you need after extracting a table from a PDF.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install pdf2sheets-helper
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```js
|
|
14
|
+
const {
|
|
15
|
+
cleanTable,
|
|
16
|
+
detectHeaders,
|
|
17
|
+
toCSV,
|
|
18
|
+
normalizeNumbers,
|
|
19
|
+
} = require("pdf2sheets-helper");
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
### cleanTable(rows)
|
|
23
|
+
|
|
24
|
+
Trims whitespace from every cell and removes rows that are entirely empty. Useful for cleaning up raw data after extracting a table from a PDF to Excel or Google Sheets.
|
|
25
|
+
|
|
26
|
+
```js
|
|
27
|
+
const raw = [
|
|
28
|
+
[" Name ", " Revenue "],
|
|
29
|
+
["", " "],
|
|
30
|
+
[" Acme Corp", " $1,200 "],
|
|
31
|
+
];
|
|
32
|
+
|
|
33
|
+
const cleaned = cleanTable(raw);
|
|
34
|
+
// [
|
|
35
|
+
// ["Name", "Revenue"],
|
|
36
|
+
// ["Acme Corp", "$1,200"]
|
|
37
|
+
// ]
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### detectHeaders(rows)
|
|
41
|
+
|
|
42
|
+
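Checks whether the first row looks like a header row (every cell is a non-empty, non-numeric string). Returns an object with `hasHeaders` and `headers` fields; `headers` holds the trimmed first row, or `null` when no header row is detected.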
|
|
43
|
+
|
|
44
|
+
```js
|
|
45
|
+
const result = detectHeaders([
|
|
46
|
+
["Product", "Price", "Quantity"],
|
|
47
|
+
["Widget", "9.99", "50"],
|
|
48
|
+
]);
|
|
49
|
+
// { hasHeaders: true, headers: ["Product", "Price", "Quantity"] }
|
|
50
|
+
|
|
51
|
+
const noHeaders = detectHeaders([
|
|
52
|
+
["100", "200"],
|
|
53
|
+
["300", "400"],
|
|
54
|
+
]);
|
|
55
|
+
// { hasHeaders: false, headers: null }
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### toCSV(rows, delimiter?)
|
|
59
|
+
|
|
60
|
+
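Converts a 2D array to a CSV string using RFC 4180-style quoting. Cells containing the delimiter, double-quotes, or newlines are automatically quoted, and double-quotes inside a cell are escaped by doubling them. Rows are joined with `\n` (not the CRLF that RFC 4180 specifies), and no trailing newline is added.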
|
|
61
|
+
|
|
62
|
+
```js
|
|
63
|
+
const csv = toCSV([
|
|
64
|
+
["Name", "City"],
|
|
65
|
+
["Alice", "New York"],
|
|
66
|
+
['Bob "Builder"', "LA"],
|
|
67
|
+
]);
|
|
68
|
+
// Name,City
|
|
69
|
+
// Alice,New York
|
|
70
|
+
// "Bob ""Builder""",LA
|
|
71
|
+
|
|
72
|
+
// Tab-separated output
|
|
73
|
+
const tsv = toCSV([["A", "B"], ["1", "2"]], "\t");
|
|
74
|
+
// A\tB\n1\t2
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### normalizeNumbers(rows, columns?)
|
|
78
|
+
|
|
79
|
+
Strips currency symbols (`$`, `EUR`, `GBP`, etc.), thousands separators, and percent signs from specified columns, converting the values to JavaScript numbers. Handles both US (`1,234.56`) and European (`1.234,56`) formats.
|
|
80
|
+
|
|
81
|
+
If `columns` is omitted, all columns are processed.
|
|
82
|
+
|
|
83
|
+
```js
|
|
84
|
+
const data = [
|
|
85
|
+
["Item", "$1,200.50", "15%"],
|
|
86
|
+
["Widget", "EUR 300", "8%"],
|
|
87
|
+
];
|
|
88
|
+
|
|
89
|
+
const normalized = normalizeNumbers(data, [1, 2]);
|
|
90
|
+
// [
|
|
91
|
+
// ["Item", 1200.5, 15],
|
|
92
|
+
// ["Widget", 300, 8]
|
|
93
|
+
// ]
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Putting It All Together
|
|
97
|
+
|
|
98
|
+
A typical workflow for processing a PDF table to Google Sheets or Excel:
|
|
99
|
+
|
|
100
|
+
```js
|
|
101
|
+
const { cleanTable, detectHeaders, normalizeNumbers, toCSV } = require("pdf2sheets-helper");
|
|
102
|
+
|
|
103
|
+
// Raw data extracted from a PDF
|
|
104
|
+
const raw = [
|
|
105
|
+
[" Product ", " Price ", " Qty "],
|
|
106
|
+
["", "", ""],
|
|
107
|
+
[" Widget A", " $1,499.00 ", " 25 "],
|
|
108
|
+
[" Widget B", " $320.50 ", " 100 "],
|
|
109
|
+
];
|
|
110
|
+
|
|
111
|
+
// Step 1: Clean whitespace and empty rows
|
|
112
|
+
const cleaned = cleanTable(raw);
|
|
113
|
+
|
|
114
|
+
// Step 2: Check for headers
|
|
115
|
+
const { hasHeaders, headers } = detectHeaders(cleaned);
|
|
116
|
+
console.log("Headers:", headers);
|
|
117
|
+
// ["Product", "Price", "Qty"]
|
|
118
|
+
|
|
119
|
+
// Step 3: Normalize numeric columns
|
|
120
|
+
const dataRows = hasHeaders ? cleaned.slice(1) : cleaned;
|
|
121
|
+
const normalized = normalizeNumbers(dataRows, [1, 2]);
|
|
122
|
+
|
|
123
|
+
// Step 4: Export to CSV
|
|
124
|
+
const output = hasHeaders ? [headers, ...normalized] : normalized;
|
|
125
|
+
console.log(toCSV(output));
|
|
126
|
+
// Product,Price,Qty
|
|
127
|
+
// Widget A,1499,25
|
|
128
|
+
// Widget B,320.5,100
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## Browser-Based PDF Table Extraction
|
|
132
|
+
|
|
133
|
+
These helpers assume you already have table data as a 2D array. If you need to extract tables from PDFs first, try [pdf2sheets](https://pdf2sheets.app) -- a browser extension that lets you extract tables from any PDF directly into Google Sheets or Excel. No uploads, no server processing; everything runs locally in your browser.
|
|
134
|
+
|
|
135
|
+
It handles multi-page tables, merged cells, and messy PDF layouts so you can skip the manual copy-paste step entirely.
|
|
136
|
+
|
|
137
|
+
## Keywords
|
|
138
|
+
|
|
139
|
+
extract table from pdf to excel, pdf table to google sheets, pdf table extraction, convert pdf table to csv, pdf to spreadsheet, pdf data extraction
|
|
140
|
+
|
|
141
|
+
## License
|
|
142
|
+
|
|
143
|
+
MIT
|
package/index.js
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pdf2sheets-helper
|
|
3
|
+
*
|
|
4
|
+
* Utility helpers for working with PDF table data in spreadsheet workflows.
|
|
5
|
+
* Use these functions to clean, transform, and export 2D arrays of table data
|
|
6
|
+
* extracted from PDFs.
|
|
7
|
+
*
|
|
8
|
+
* https://pdf2sheets.app
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
/**
 * Clean a 2D table array by trimming whitespace from every cell
 * and removing rows that are entirely empty.
 *
 * Each cell is converted with `String()` and trimmed. `null` and `undefined`
 * cells are treated as empty cells (`""`) rather than being stringified to
 * the literal text "null" / "undefined", so a row made only of such cells
 * is removed like any other blank row. The input array is not mutated.
 *
 * @param {Array<Array<*>>} rows - 2D array of table data
 * @returns {string[][]} Cleaned table with trimmed cells and no empty rows
 * @throws {TypeError} If `rows` is not an array
 *
 * @example
 * cleanTable([[" Name ", " Age"], ["", "  "], ["Alice", "30"]])
 * // => [["Name", "Age"], ["Alice", "30"]]
 *
 * cleanTable([[null, undefined], ["Bob", null]])
 * // => [["Bob", ""]]
 */
function cleanTable(rows) {
  if (!Array.isArray(rows)) {
    throw new TypeError("cleanTable expects a 2D array of rows");
  }

  // `cell == null` matches both null and undefined; those are blank cells.
  const toTrimmedText = (cell) => (cell == null ? "" : String(cell).trim());

  return rows
    .map((row) => row.map(toTrimmedText))
    .filter((row) => row.some((cell) => cell.length > 0));
}
|
|
31
|
+
|
|
32
|
+
/**
 * Detect whether the first row of a table looks like a header row.
 *
 * A row is considered a header when it has at least one cell and every cell,
 * once trimmed, is a non-empty string that does not parse as a finite number
 * via `Number()`. This is a simple heuristic; only the first row is inspected.
 *
 * The "no headers" result is returned (instead of throwing or reporting a
 * vacuous match) when `rows` is not an array, is empty, or when its first row
 * is not an array or has no cells. `null` / `undefined` cells count as empty
 * and therefore disqualify the row.
 *
 * @param {Array<Array<*>>} rows - 2D array of table data
 * @returns {{ hasHeaders: boolean, headers: string[] | null }}
 *   `headers` is the trimmed first row when `hasHeaders` is true, else `null`.
 *
 * @example
 * detectHeaders([["Product", "Price"], ["Widget", "9.99"]])
 * // => { hasHeaders: true, headers: ["Product", "Price"] }
 *
 * detectHeaders([["100", "200"], ["300", "400"]])
 * // => { hasHeaders: false, headers: null }
 *
 * detectHeaders([[]])
 * // => { hasHeaders: false, headers: null }
 */
function detectHeaders(rows) {
  if (!Array.isArray(rows) || rows.length === 0) {
    return { hasHeaders: false, headers: null };
  }

  const firstRow = rows[0];

  // `every()` is vacuously true on an empty array, so an empty first row
  // must be rejected explicitly.
  if (!Array.isArray(firstRow) || firstRow.length === 0) {
    return { hasHeaders: false, headers: null };
  }

  const labels = firstRow.map((cell) =>
    cell == null ? "" : String(cell).trim()
  );

  const looksLikeHeaders = labels.every(
    (label) => label.length > 0 && !Number.isFinite(Number(label))
  );

  return {
    hasHeaders: looksLikeHeaders,
    headers: looksLikeHeaders ? labels : null,
  };
}
|
|
68
|
+
|
|
69
|
+
/**
 * Convert a 2D array to a CSV string.
 *
 * Cells that contain the delimiter, a double-quote, a line feed, or a
 * carriage return are wrapped in double-quotes. Existing double-quotes inside
 * cells are escaped by doubling them (RFC 4180 quoting rules).
 *
 * Cells are converted with `String()`, except `null` and `undefined`, which
 * are written as empty fields rather than the literal text "null" /
 * "undefined". Rows are joined with "\n" and no trailing newline is added.
 *
 * @param {Array<Array<*>>} rows - 2D array of table data
 * @param {string} [delimiter=","] - Column delimiter (e.g. "," or "\t")
 * @returns {string} CSV-formatted string ("" for an empty `rows` array)
 * @throws {TypeError} If `rows` is not an array
 *
 * @example
 * toCSV([["Name", "City"], ["Alice", "New York"]])
 * // => 'Name,City\nAlice,New York'
 *
 * toCSV([["A", "B"], ["1", "2"]], "\t")
 * // => 'A\tB\n1\t2'
 *
 * toCSV([["a", null, 0]])
 * // => 'a,,0'
 */
function toCSV(rows, delimiter) {
  if (!Array.isArray(rows)) {
    throw new TypeError("toCSV expects a 2D array of rows");
  }

  const sep = delimiter != null ? delimiter : ",";

  const formatCell = (cell) => {
    // Missing values become empty fields, not the words "null"/"undefined".
    const value = cell == null ? "" : String(cell);

    const needsQuoting =
      value.includes(sep) ||
      value.includes('"') ||
      value.includes("\n") ||
      value.includes("\r");

    return needsQuoting ? '"' + value.replace(/"/g, '""') + '"' : value;
  };

  return rows.map((row) => row.map(formatCell).join(sep)).join("\n");
}
|
|
116
|
+
|
|
117
|
+
/**
 * Strip currency symbols, currency codes, thousands separators, and percent
 * signs from numeric-looking cells, converting them to actual numbers.
 *
 * Only the columns listed in the `columns` array are processed. All other
 * columns are left untouched. If `columns` is omitted, every column is
 * processed. A cell that does not end up as a finite number after stripping
 * is returned unchanged (the original cell value, not the stripped text).
 *
 * Recognized formats:
 *   - currency symbols ($, euro, pound, yen) before or after the number:
 *     "$1,234.56", "1.234,56 \u20AC"
 *   - currency codes USD/EUR/GBP/JPY/CAD/AUD (case-insensitive) before or
 *     directly after the number: "USD 100", "300 EUR"
 *   - a leading sign, including one placed before the symbol: "-$1,200.50"
 *   - European grouping "1.234,56" / "-1.234,56" (dot = thousands,
 *     comma = decimal)
 *   - a lone decimal comma followed by one or two digits: "12,50" => 12.5
 *   - trailing percent signs: "15%" => 15 (the value is NOT divided by 100)
 *
 * Ambiguity note: a value such as "1.234" matches the European grouping
 * pattern and is therefore read as 1234, not 1.234.
 *
 * @param {Array<Array<*>>} rows - 2D array of table data
 * @param {number[]} [columns] - Zero-based column indices to normalize.
 *   If omitted, all columns are processed.
 * @returns {Array<Array<*>>} New table with normalized numeric values;
 *   the input is not mutated.
 * @throws {TypeError} If `rows` is not an array
 *
 * @example
 * normalizeNumbers([["Item", "$1,200.50"], ["Widget", "$300"]], [1])
 * // => [["Item", 1200.5], ["Widget", 300]]
 *
 * normalizeNumbers([["-1.234,56", "12,50", "300 EUR"]])
 * // => [[-1234.56, 12.5, 300]]
 */
function normalizeNumbers(rows, columns) {
  if (!Array.isArray(rows)) {
    throw new TypeError("normalizeNumbers expects a 2D array of rows");
  }

  // Optional leading sign, lifted off first so "-$5" and "-EUR 5" parse.
  const leadingSign = /^([+-])\s*(.*)$/;
  // ISO-style currency codes, as a prefix or directly after a digit.
  const codePrefix = /^(?:USD|EUR|GBP|JPY|CAD|AUD)\s*/i;
  const codeSuffix = /(\d)\s*(?:USD|EUR|GBP|JPY|CAD|AUD)$/i;
  // Leading symbols/whitespace and trailing symbols/whitespace/percent signs.
  // Symbols: $, \u20AC (euro), \u00A3 (pound), \u00A5 (yen).
  const symbolWrap =
    /^[\s$\u20AC\u00A3\u00A5]*(.*?)[\s%$\u20AC\u00A3\u00A5]*$/;
  // "1.234,56": dot-grouped thousands with an optional comma decimal part.
  const europeanGrouped = /^[+-]?\d{1,3}(?:\.\d{3})+(?:,\d+)?$/;
  // "12,50": a single comma followed by 1-2 digits cannot be a thousands
  // separator, so it is read as a decimal comma.
  const decimalComma = /^[+-]?\d+,\d{1,2}$/;

  const normalizeCell = (cell) => {
    let value = String(cell).trim();

    let sign = "";
    const signMatch = value.match(leadingSign);
    if (signMatch) {
      sign = signMatch[1];
      value = signMatch[2];
    }

    value = value.replace(codePrefix, "").replace(codeSuffix, "$1");

    const symbolMatch = value.match(symbolWrap);
    if (symbolMatch) {
      value = symbolMatch[1];
    }

    value = sign + value;

    if (europeanGrouped.test(value)) {
      value = value.replace(/\./g, "").replace(",", ".");
    } else if (decimalComma.test(value)) {
      value = value.replace(",", ".");
    } else {
      // Standard format: remove commas used as thousands separators
      value = value.replace(/,/g, "");
    }

    // Remove a trailing percent sign that survived symbol stripping
    value = value.replace(/%$/, "");

    // Number("") is 0, so an empty remainder must not count as numeric.
    const num = Number(value);
    return Number.isFinite(num) && value.length > 0 ? num : cell;
  };

  return rows.map((row) =>
    row.map((cell, colIndex) => {
      // Skip columns not in the target list
      if (columns != null && !columns.includes(colIndex)) {
        return cell;
      }
      return normalizeCell(cell);
    })
  );
}
|
|
179
|
+
|
|
180
|
+
// Public API (CommonJS): the four table helpers defined in this file.
module.exports = { cleanTable, detectHeaders, toCSV, normalizeNumbers };
|
package/package.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "pdf2sheets-helper",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Utility helpers for working with PDF table data in spreadsheet workflows",
|
|
5
|
+
"main": "index.js",
|
|
6
|
+
"homepage": "https://pdf2sheets.app",
|
|
7
|
+
"keywords": [
|
|
8
|
+
"pdf",
|
|
9
|
+
"google-sheets",
|
|
10
|
+
"excel",
|
|
11
|
+
"table-extraction",
|
|
12
|
+
"spreadsheet",
|
|
13
|
+
"pdf-to-excel",
|
|
14
|
+
"csv"
|
|
15
|
+
],
|
|
16
|
+
"license": "MIT",
|
|
17
|
+
"repository": {
|
|
18
|
+
"type": "git",
|
|
19
|
+
"url": "https://github.com/sgcaptainworks-arch/pdf2sheets-helper"
|
|
20
|
+
},
|
|
21
|
+
"author": "pdf2sheets",
|
|
22
|
+
"files": [
|
|
23
|
+
"index.js"
|
|
24
|
+
],
|
|
25
|
+
"engines": {
|
|
26
|
+
"node": ">=14.0.0"
|
|
27
|
+
}
|
|
28
|
+
}
|