pdf2sheets-helper 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/README.md +143 -0
  2. package/index.js +185 -0
  3. package/package.json +28 -0
package/README.md ADDED
@@ -0,0 +1,143 @@
1
+ # pdf2sheets-helper
2
+
3
+ Utility helpers for working with PDF table data in spreadsheet workflows. Clean, detect headers, normalize numbers, and export to CSV -- everything you need after extracting a table from a PDF.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ npm install pdf2sheets-helper
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```js
14
+ const {
15
+ cleanTable,
16
+ detectHeaders,
17
+ toCSV,
18
+ normalizeNumbers,
19
+ } = require("pdf2sheets-helper");
20
+ ```
21
+
22
+ ### cleanTable(rows)
23
+
24
+ Trims whitespace from every cell and removes rows that are entirely empty. Useful for cleaning up raw data after extracting a table from a PDF to Excel or Google Sheets.
25
+
26
+ ```js
27
+ const raw = [
28
+ [" Name ", " Revenue "],
29
+ ["", " "],
30
+ [" Acme Corp", " $1,200 "],
31
+ ];
32
+
33
+ const cleaned = cleanTable(raw);
34
+ // [
35
+ // ["Name", "Revenue"],
36
+ // ["Acme Corp", "$1,200"]
37
+ // ]
38
+ ```
39
+
40
+ ### detectHeaders(rows)
41
+
42
+ Checks whether the first row looks like a header row (every cell is non-empty and does not parse as a finite number). Returns an object with `hasHeaders` and `headers` fields; `headers` is `null` when no header row is detected.
43
+
44
+ ```js
45
+ const result = detectHeaders([
46
+ ["Product", "Price", "Quantity"],
47
+ ["Widget", "9.99", "50"],
48
+ ]);
49
+ // { hasHeaders: true, headers: ["Product", "Price", "Quantity"] }
50
+
51
+ const noHeaders = detectHeaders([
52
+ ["100", "200"],
53
+ ["300", "400"],
54
+ ]);
55
+ // { hasHeaders: false, headers: null }
56
+ ```
57
+
58
+ ### toCSV(rows, delimiter?)
59
+
60
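+ Converts a 2D array to a CSV string with RFC 4180-style escaping. Cells containing the delimiter, double-quotes, or line breaks are automatically wrapped in double-quotes, and double-quotes inside a cell are escaped by doubling them. Rows are joined with `\n`.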
61
+
62
+ ```js
63
+ const csv = toCSV([
64
+ ["Name", "City"],
65
+ ["Alice", "New York"],
66
+ ['Bob "Builder"', "LA"],
67
+ ]);
68
+ // Name,City
69
+ // Alice,New York
70
+ // "Bob ""Builder""",LA
71
+
72
+ // Tab-separated output
73
+ const tsv = toCSV([["A", "B"], ["1", "2"]], "\t");
74
+ // A\tB\n1\t2
75
+ ```
76
+
77
+ ### normalizeNumbers(rows, columns?)
78
+
79
+ Strips currency symbols (`$`, `EUR`, `GBP`, etc.), thousands separators, and percent signs from specified columns, converting the values to JavaScript numbers. Handles both US (`1,234.56`) and European (`1.234,56`) formats.
80
+
81
+ If `columns` is omitted, all columns are processed.
82
+
83
+ ```js
84
+ const data = [
85
+ ["Item", "$1,200.50", "15%"],
86
+ ["Widget", "EUR 300", "8%"],
87
+ ];
88
+
89
+ const normalized = normalizeNumbers(data, [1, 2]);
90
+ // [
91
+ // ["Item", 1200.5, 15],
92
+ // ["Widget", 300, 8]
93
+ // ]
94
+ ```
95
+
96
+ ## Putting It All Together
97
+
98
+ A typical workflow for processing a PDF table to Google Sheets or Excel:
99
+
100
+ ```js
101
+ const { cleanTable, detectHeaders, normalizeNumbers, toCSV } = require("pdf2sheets-helper");
102
+
103
+ // Raw data extracted from a PDF
104
+ const raw = [
105
+ [" Product ", " Price ", " Qty "],
106
+ ["", "", ""],
107
+ [" Widget A", " $1,499.00 ", " 25 "],
108
+ [" Widget B", " $320.50 ", " 100 "],
109
+ ];
110
+
111
+ // Step 1: Clean whitespace and empty rows
112
+ const cleaned = cleanTable(raw);
113
+
114
+ // Step 2: Check for headers
115
+ const { hasHeaders, headers } = detectHeaders(cleaned);
116
+ console.log("Headers:", headers);
117
+ // ["Product", "Price", "Qty"]
118
+
119
+ // Step 3: Normalize numeric columns
120
+ const dataRows = hasHeaders ? cleaned.slice(1) : cleaned;
121
+ const normalized = normalizeNumbers(dataRows, [1, 2]);
122
+
123
+ // Step 4: Export to CSV
124
+ const output = hasHeaders ? [headers, ...normalized] : normalized;
125
+ console.log(toCSV(output));
126
+ // Product,Price,Qty
127
+ // Widget A,1499,25
128
+ // Widget B,320.5,100
129
+ ```
130
+
131
+ ## Browser-Based PDF Table Extraction
132
+
133
+ These helpers assume you already have table data as a 2D array. If you need to extract tables from PDFs first, try [pdf2sheets](https://pdf2sheets.app) -- a browser extension that lets you extract tables from any PDF directly into Google Sheets or Excel. No uploads, no server processing; everything runs locally in your browser.
134
+
135
+ It handles multi-page tables, merged cells, and messy PDF layouts so you can skip the manual copy-paste step entirely.
136
+
137
+ ## Keywords
138
+
139
+ extract table from pdf to excel, pdf table to google sheets, pdf table extraction, convert pdf table to csv, pdf to spreadsheet, pdf data extraction
140
+
141
+ ## License
142
+
143
+ MIT
package/index.js ADDED
@@ -0,0 +1,185 @@
1
+ /**
2
+ * pdf2sheets-helper
3
+ *
4
+ * Utility helpers for working with PDF table data in spreadsheet workflows.
5
+ * Use these functions to clean, transform, and export 2D arrays of table data
6
+ * extracted from PDFs.
7
+ *
8
+ * https://pdf2sheets.app
9
+ */
10
+
11
/**
 * Clean a 2D table array by trimming whitespace from every cell
 * and removing rows that are entirely empty.
 *
 * Every cell is converted to a string. `null` and `undefined` cells are
 * treated as empty cells (they become "") rather than the literal text
 * "null" / "undefined", so rows made only of missing values are removed too.
 *
 * @param {Array<Array<*>>} rows - 2D array of table data
 * @returns {string[][]} New table with trimmed cells and no empty rows
 * @throws {TypeError} If `rows` is not an array, or any row is not an array
 *
 * @example
 * cleanTable([[" Name ", " Age"], ["", "  "], ["Alice", "30"]])
 * // => [["Name", "Age"], ["Alice", "30"]]
 */
function cleanTable(rows) {
  if (!Array.isArray(rows)) {
    throw new TypeError("cleanTable expects a 2D array of rows");
  }

  return rows
    .map((row, rowIndex) => {
      if (!Array.isArray(row)) {
        // Fail with a message that names the offending row instead of the
        // opaque "row.map is not a function".
        throw new TypeError(
          `cleanTable expects every row to be an array (row ${rowIndex} is not)`
        );
      }

      // `cell == null` matches both null and undefined.
      return row.map((cell) => (cell == null ? "" : String(cell).trim()));
    })
    .filter((row) => row.some((cell) => cell.length > 0));
}
31
+
32
/**
 * Detect whether the first row of a table looks like a header row.
 *
 * The first row is considered a header when it has at least one cell and
 * every cell, after trimming, is non-empty and does not parse as a finite
 * number. `null` and `undefined` cells count as empty. This is a simple
 * heuristic; a row of labels that happen to be numeric (e.g. years) is not
 * recognized as a header.
 *
 * @param {Array<Array<*>>} rows - 2D array of table data
 * @returns {{ hasHeaders: boolean, headers: string[] | null }}
 *   `headers` holds the trimmed first-row cells when a header row is
 *   detected, otherwise `null`. Malformed input (not an array, no rows,
 *   first row missing/empty/not an array) yields the "no headers" result
 *   instead of throwing.
 *
 * @example
 * detectHeaders([["Product", "Price"], ["Widget", "9.99"]])
 * // => { hasHeaders: true, headers: ["Product", "Price"] }
 *
 * detectHeaders([["100", "200"], ["300", "400"]])
 * // => { hasHeaders: false, headers: null }
 */
function detectHeaders(rows) {
  if (!Array.isArray(rows) || rows.length === 0) {
    return { hasHeaders: false, headers: null };
  }

  const firstRow = rows[0];

  // An empty row would pass `every()` vacuously and be reported as a header
  // row with zero columns, so reject it (and non-array rows) up front.
  if (!Array.isArray(firstRow) || firstRow.length === 0) {
    return { hasHeaders: false, headers: null };
  }

  // Array.from visits holes in sparse rows too, so they count as empty cells.
  const trimmedCells = Array.from(firstRow, (cell) =>
    cell == null ? "" : String(cell).trim()
  );

  const looksLikeHeaders = trimmedCells.every(
    (text) => text.length > 0 && !Number.isFinite(Number(text))
  );

  return {
    hasHeaders: looksLikeHeaders,
    headers: looksLikeHeaders ? trimmedCells : null,
  };
}
68
+
69
/**
 * Convert a 2D array to a CSV string.
 *
 * Cells that contain the delimiter, a double-quote, a line feed, or a
 * carriage return are wrapped in double-quotes. Existing double-quotes
 * inside cells are escaped by doubling them (RFC 4180). `null` and
 * `undefined` cells are written as empty fields rather than the literal
 * text "null" / "undefined". Rows are joined with "\n" and no trailing
 * line break is added.
 *
 * @param {Array<Array<*>>} rows - 2D array of table data
 * @param {string} [delimiter=","] - Column delimiter (e.g. "," or "\t")
 * @returns {string} CSV-formatted string
 * @throws {TypeError} If `rows` is not an array, or any row is not an array
 *
 * @example
 * toCSV([["Name", "City"], ["Alice", "New York"]])
 * // => 'Name,City\nAlice,New York'
 *
 * toCSV([["A", "B"], ["1", "2"]], "\t")
 * // => 'A\tB\n1\t2'
 */
function toCSV(rows, delimiter) {
  if (!Array.isArray(rows)) {
    throw new TypeError("toCSV expects a 2D array of rows");
  }

  // `!= null` keeps the default for both an omitted and an explicit-null delimiter.
  const sep = delimiter != null ? delimiter : ",";

  const formatCell = (cell) => {
    const value = cell == null ? "" : String(cell);

    const needsQuoting =
      value.includes(sep) ||
      value.includes('"') ||
      value.includes("\n") ||
      value.includes("\r");

    return needsQuoting ? '"' + value.replace(/"/g, '""') + '"' : value;
  };

  return rows
    .map((row, rowIndex) => {
      if (!Array.isArray(row)) {
        throw new TypeError(
          `toCSV expects every row to be an array (row ${rowIndex} is not)`
        );
      }

      // Array.from (unlike map) visits holes, so sparse rows also get empty fields.
      return Array.from(row, formatCell).join(sep);
    })
    .join("\n");
}
116
+
117
// Optional sign, then one or more leading currency markers: ISO-style codes
// or the symbols $, € (U+20AC), £ (U+00A3), ¥ (U+00A5), with any whitespace
// between them. The sign is captured so it can be kept ("-$5" -> "-5").
const LEADING_CURRENCY_PATTERN =
  /^([-+]?)\s*(?:USD|EUR|GBP|JPY|CAD|AUD|[$\u20AC\u00A3\u00A5])(?:\s|USD|EUR|GBP|JPY|CAD|AUD|[$\u20AC\u00A3\u00A5])*/i;

// Trailing whitespace, percent signs, currency codes and currency symbols.
const TRAILING_UNIT_PATTERN =
  /(?:\s|%|USD|EUR|GBP|JPY|CAD|AUD|[$\u20AC\u00A3\u00A5])+$/i;

// Unambiguously European grouping: dot-separated thousands followed by a
// decimal comma ("1.234,56"), or at least two dot groups ("1.234.567").
// A single dot group without a comma ("1.234", "0.125") is deliberately NOT
// matched: it is read as a plain decimal, mirroring how "1,234" is read as
// US thousands.
const EUROPEAN_GROUPED_PATTERN =
  /^[-+]?\d{1,3}(?:\.\d{3})+,\d+$|^[-+]?\d{1,3}(?:\.\d{3}){2,}$/;

// A single comma and no dots, e.g. "300,50". Whether it is a decimal comma
// depends on the number of digits after it (see parseNumericCell).
const SINGLE_COMMA_PATTERN = /^([-+]?\d+),(\d+)$/;

// Plain decimal notation with optional exponent. Used instead of relying on
// Number() alone, which would also accept "0x10", "0b11", "0o7".
const DECIMAL_NUMBER_PATTERN = /^[-+]?(?:\d+\.?\d*|\.\d+)(?:e[-+]?\d+)?$/i;

/**
 * Parse a single cell as a number after removing currency markers, percent
 * signs and thousands separators.
 *
 * @param {*} cell - Raw cell value
 * @returns {number|null} The parsed finite number, or null if the cell does
 *   not look numeric
 */
function parseNumericCell(cell) {
  let value = String(cell).trim();

  // Drop leading currency markers but keep a sign that preceded them.
  value = value.replace(LEADING_CURRENCY_PATTERN, "$1");

  // Drop trailing percent signs / currency markers ("15%", "300 EUR").
  value = value.replace(TRAILING_UNIT_PATTERN, "");

  const singleComma = SINGLE_COMMA_PATTERN.exec(value);

  if (EUROPEAN_GROUPED_PATTERN.test(value)) {
    // Dots are thousands separators, the comma is the decimal mark.
    value = value.replace(/\./g, "").replace(",", ".");
  } else if (singleComma && singleComma[2].length !== 3) {
    // "300,50" or "1,5": a comma not followed by exactly three digits cannot
    // be a thousands separator, so it is a decimal comma.
    value = `${singleComma[1]}.${singleComma[2]}`;
  } else {
    // Standard format: commas are thousands separators.
    value = value.replace(/,/g, "");
  }

  if (!DECIMAL_NUMBER_PATTERN.test(value)) {
    return null;
  }

  const num = Number(value);
  // Rejects overflow such as "1e999", which converts to Infinity.
  return Number.isFinite(num) ? num : null;
}

/**
 * Strip currency markers, percent signs and thousands separators from
 * numeric-looking cells, converting them to actual numbers.
 *
 * Only the columns listed in the `columns` array are processed. All other
 * columns are left untouched. If `columns` is omitted, every column is
 * processed. Cells that do not parse as a finite decimal number are returned
 * unchanged (the original value, not a trimmed copy).
 *
 * Recognized formats: "$1,234.56", "-$1,234.56", "1.234,56" (European),
 * "300,50" (decimal comma), "USD 100", "100 EUR", "100%".
 *
 * Ambiguous inputs follow the US convention: "1,234" is 1234 and "1.234" is
 * 1.234.
 *
 * @param {Array<Array<*>>} rows - 2D array of table data
 * @param {number[]} [columns] - Zero-based column indices to normalize.
 *   If omitted, all columns are processed.
 * @returns {Array<Array<*>>} New table with normalized numeric values
 * @throws {TypeError} If `rows` is not an array, or any row is not an array
 *
 * @example
 * normalizeNumbers([["Item", "$1,200.50"], ["Widget", "$300"]], [1])
 * // => [["Item", 1200.5], ["Widget", 300]]
 */
function normalizeNumbers(rows, columns) {
  if (!Array.isArray(rows)) {
    throw new TypeError("normalizeNumbers expects a 2D array of rows");
  }

  return rows.map((row, rowIndex) => {
    if (!Array.isArray(row)) {
      throw new TypeError(
        `normalizeNumbers expects every row to be an array (row ${rowIndex} is not)`
      );
    }

    return row.map((cell, colIndex) => {
      // Skip columns not in the target list.
      if (columns != null && !columns.includes(colIndex)) {
        return cell;
      }

      const parsed = parseNumericCell(cell);
      return parsed === null ? cell : parsed;
    });
  });
}
179
+
180
// Public API of the package: the four table helpers defined in this file.
module.exports = { cleanTable, detectHeaders, toCSV, normalizeNumbers };
package/package.json ADDED
@@ -0,0 +1,28 @@
1
+ {
2
+ "name": "pdf2sheets-helper",
3
+ "version": "1.0.0",
4
+ "description": "Utility helpers for working with PDF table data in spreadsheet workflows",
5
+ "main": "index.js",
6
+ "homepage": "https://pdf2sheets.app",
7
+ "keywords": [
8
+ "pdf",
9
+ "google-sheets",
10
+ "excel",
11
+ "table-extraction",
12
+ "spreadsheet",
13
+ "pdf-to-excel",
14
+ "csv"
15
+ ],
16
+ "license": "MIT",
17
+ "repository": {
18
+ "type": "git",
19
+ "url": "https://github.com/sgcaptainworks-arch/pdf2sheets-helper"
20
+ },
21
+ "author": "pdf2sheets",
22
+ "files": [
23
+ "index.js"
24
+ ],
25
+ "engines": {
26
+ "node": ">=14.0.0"
27
+ }
28
+ }