sec-edgar-api 0.1.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/LICENSE.md +21 -0
  2. package/README.md +3 -5
  3. package/build/services/DocumentParser/HtmlTableExtractor.d.ts +41 -0
  4. package/build/services/DocumentParser/HtmlTableExtractor.js +408 -0
  5. package/build/services/DocumentParser/XMLParser.d.ts +20 -5
  6. package/build/services/DocumentParser/XMLParser.js +122 -118
  7. package/build/services/DocumentParser/parsers/index.d.ts +5 -3
  8. package/build/services/DocumentParser/parsers/index.js +5 -3
  9. package/build/services/DocumentParser/parsers/parse-current-filings.d.ts +3 -0
  10. package/build/services/DocumentParser/parsers/parse-current-filings.js +98 -0
  11. package/build/services/DocumentParser/parsers/parse-form-13f.d.ts +6 -0
  12. package/build/services/DocumentParser/parsers/parse-form-13f.js +91 -0
  13. package/build/services/DocumentParser/parsers/parse-form-13g.js +2 -2
  14. package/build/services/DocumentParser/parsers/parse-form-4.d.ts +6 -1
  15. package/build/services/DocumentParser/parsers/parse-form-4.js +134 -204
  16. package/build/services/DocumentParser/parsers/parse-form-def14a.d.ts +1 -2
  17. package/build/services/DocumentParser/parsers/parse-form-def14a.js +157 -106
  18. package/build/services/ReportBuilder/FactFiscalCalculator.d.ts +47 -0
  19. package/build/services/ReportBuilder/FactFiscalCalculator.js +228 -0
  20. package/build/services/ReportBuilder/FactPeriodResolver.d.ts +44 -0
  21. package/build/services/ReportBuilder/FactPeriodResolver.js +185 -0
  22. package/build/services/ReportBuilder/FactRecordBuilder.d.ts +7 -0
  23. package/build/services/ReportBuilder/FactRecordBuilder.js +49 -0
  24. package/build/services/ReportBuilder/FactSplitAdjuster.d.ts +39 -0
  25. package/build/services/ReportBuilder/FactSplitAdjuster.js +192 -0
  26. package/build/services/ReportBuilder/ReportBuilder.d.ts +37 -0
  27. package/build/services/ReportBuilder/ReportBuilder.js +180 -0
  28. package/build/services/ReportBuilder/ReportRawResolvable.d.ts +17 -0
  29. package/build/services/ReportBuilder/ReportRawResolvable.js +114 -0
  30. package/build/services/ReportBuilder/index.d.ts +2 -0
  31. package/build/services/ReportBuilder/index.js +4 -0
  32. package/build/services/ReportParser/FactItem.d.ts +66 -0
  33. package/build/services/ReportParser/FactItem.js +50 -0
  34. package/build/services/ReportParser/FactItemFactory.d.ts +22 -0
  35. package/build/services/ReportParser/FactItemFactory.js +150 -0
  36. package/build/services/ReportParser/FactSplitMap.d.ts +16 -0
  37. package/build/services/ReportParser/FactSplitMap.js +101 -0
  38. package/build/services/ReportParser/PropertyResolver.d.ts +1 -0
  39. package/build/services/ReportParser/PropertyResolver.js +1 -0
  40. package/build/services/ReportParser/ReportParser.d.ts +3 -10
  41. package/build/services/ReportParser/ReportParser.js +8 -23
  42. package/build/services/ReportParser/ReportRawParser.d.ts +5 -28
  43. package/build/services/ReportParser/ReportRawParser.js +29 -141
  44. package/build/services/ReportParser/ReportWrapper.js +2 -5
  45. package/build/services/ReportParser/resolvers/index.d.ts +2 -0
  46. package/build/services/ReportParser/resolvers/index.js +2 -0
  47. package/build/services/ReportParser/resolvers/resolve-cash-flow-capex.js +4 -3
  48. package/build/services/ReportParser/resolvers/resolve-cash-flow-operating.js +1 -1
  49. package/build/services/ReportParser/resolvers/resolve-cash-flow-working-capital-non-cash.js +1 -1
  50. package/build/services/ReportParser/resolvers/resolve-expense-depreciation.js +1 -1
  51. package/build/services/ReportParser/resolvers/resolve-fiscal-year-cumulative-properties.js +28 -14
  52. package/build/services/ReportParser/resolvers/resolve-q4-fiscal-year-matching-properties.js +32 -4
  53. package/build/services/ReportParser/resolvers/resolve-split-ratio.d.ts +2 -0
  54. package/build/services/ReportParser/resolvers/resolve-split-ratio.js +37 -0
  55. package/build/services/SecEdgarApi/SecEdgarApi.d.ts +85 -47
  56. package/build/services/SecEdgarApi/SecEdgarApi.js +246 -108
  57. package/build/types/current-filings-xml.type.d.ts +74 -0
  58. package/build/types/current-filings-xml.type.js +6 -0
  59. package/build/types/current-filings.type.d.ts +44 -0
  60. package/build/types/current-filings.type.js +2 -0
  61. package/build/types/form-13f-xml.type.d.ts +105 -0
  62. package/build/types/form-13f-xml.type.js +2 -0
  63. package/build/types/form-4-xml.type.d.ts +132 -0
  64. package/build/types/form-4-xml.type.js +2 -0
  65. package/build/types/index.d.ts +2 -2
  66. package/build/types/index.js +2 -2
  67. package/build/types/parsed-filings.type.d.ts +144 -5
  68. package/build/types/report-raw.type.d.ts +4 -7
  69. package/build/types/report-translated.type.d.ts +1 -2
  70. package/build/types/submission.type.d.ts +3 -2
  71. package/build/util/calculation-map-by-ns.d.ts +6 -0
  72. package/build/util/calculation-map-by-ns.js +9 -0
  73. package/build/util/key-translations.js +1 -2
  74. package/package.json +6 -3
package/LICENSE.md ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Andrew Evers
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md CHANGED
@@ -24,12 +24,10 @@ Reports are all returned as a uniform interface:
24
24
 
25
25
  ```TS
26
26
  interface ReportTranslated {
27
- dateReport: string
27
+ dateReport: string
28
28
  dateFiled: string
29
29
  fiscalPeriod: FiscalPeriod
30
30
  fiscalYear: number
31
- form: string
32
- isTTM: boolean
33
31
 
34
32
  assetTotal: number | null
35
33
  assetCurrent: number | null
@@ -94,6 +92,8 @@ interface ReportTranslated {
94
92
  cashFlowDeferredTax: number | null
95
93
 
96
94
  cashFlowWorkingCapitalNonCash: number | null
95
+
96
+ splitRatio: number | null
97
97
  }
98
98
  ```
99
99
 
@@ -114,8 +114,6 @@ const reports = await secEdgarApi.getReports({ symbol: 'AAPL' })
114
114
 
115
115
  ## Resolvers
116
116
 
117
- **WARNING** Still in testing. Values may not be accurate for all companies since the properties provided in the reports differ.
118
-
119
117
  The main problem with the EDGAR API is that the property names and data provided are not uniform. You have to deal with companies omitting important data
120
118
  in some filings, or using different property keys for the same data point.
121
119
 
@@ -0,0 +1,41 @@
1
/**
 * A single `<td>`/`<th>` cell placed on a table grid.
 *
 * A cell with a row or column span is stored once and referenced from every
 * grid slot it covers, so the same object can appear several times in `rows`.
 */
export interface Cell {
    /** Raw markup of the cell, from its opening tag through its closing tag. */
    html: string;
    /** Text of the opening tag after the tag name, without the trailing `>`. */
    attributes: string;

    /** Index of the row in which the cell was opened. */
    rowIndex: number;
    /** First grid column occupied by the cell. */
    colIndex: number;
    /** Number of rows covered; 1 when no usable `rowspan` is present. */
    rowSpan: number;
    /** Number of columns covered; 1 when no usable `colspan` is present. */
    colSpan: number;
    /** Running number of the cell within its table, in document order. */
    tableCellIndex: number;

    /** True when the cell sits at or above the detected header row. */
    isHeaderRowCell: boolean;
    /** True for a body row made of one single cell (a section title row). */
    isBodyTitleRowCell: boolean;
    /** Index of the last header row of the table, or null when none was detected. */
    headerRowIndex: number | null;

    /** Parsed cell content: a number, cleaned text, or null when empty. */
    valueParsed: string | number | null;
    /** Header text resolved for the cell's column, or null when unavailable. */
    headerCol: string | null;
}
15
/**
 * One `<table>` element found in a document, together with its parsed grid.
 */
export interface TableHTMLData {
    /** Sequence number of the table, assigned in order of its opening tag. */
    tableIndex: number;
    /** `tableIndex` of the enclosing table, or null for a top-level table. */
    parentTableIndex: number | null;
    /** `tableIndex` values of the tables nested directly inside this one. */
    childTableIndexes: number[];

    /** Offset of the opening `<` of the table within the source document. */
    positionStart: number;
    /** Offset of the final `>` of the closing tag within the source document. */
    positionEnd: number;

    /** Markup of the table itself, opening tag through closing tag. */
    html: string;
    /** Document text that precedes a top-level table; empty for nested tables. */
    htmlBefore: string;

    /** Cell grid, indexed as `rows[rowIndex][colIndex]`. */
    rows: Array<Array<Cell>>;
}
25
/**
 * Options accepted by the value-parsing helpers.
 */
interface ParseOptions {
    /** Strip markup before parsing; treated as true when omitted. */
    stripHtml?: boolean;
    /** Remove every `( ... )` group from the text; treated as false when omitted. */
    stripParenthesis?: boolean;
    /** When non-empty, only text inside one of these tags is kept. */
    tagsToInclude?: string[];
    /** Text inside any of these tags is dropped (ignored when `tagsToInclude` is non-empty). */
    tagsToExclude?: string[];
}
31
/**
 * Finds `<table>` elements in an HTML string and turns them into grids of
 * parsed cells.
 */
export default class HtmlTableExtractor {
    /**
     * Extracts every closed table (nested ones included) and runs all parsing
     * and clean-up passes on the result.
     */
    extractTables(html: string, options?: ParseOptions): TableHTMLData[];

    /** Copies the resolved column headers of the first body row into the row above it. */
    mergeHeaderRows(tables: TableHTMLData[]): void;

    /** Drops columns without body values and rows without any value; returns the same array. */
    removeEmptyColumns(tables: TableHTMLData[]): TableHTMLData[];

    /** Removes markup from `str`, optionally keeping or dropping text by enclosing tag. */
    stripHtml(str: string, options?: Omit<ParseOptions, 'stripHtml' | 'stripParenthesis'>): string;

    /** Converts raw cell content into a number, cleaned text, or null. */
    parseValue(str: string | number | null, options?: ParseOptions): string | number | null;

    private addTableCells;
    private addMissingNameCol;
    private addTableCellValues;
}
41
+ export {};
@@ -0,0 +1,408 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ var XMLParser_1 = require("./XMLParser");
4
+ var HtmlTableExtractor = /** @class */ (function () {
5
+ function HtmlTableExtractor() {
6
+ }
7
+ HtmlTableExtractor.prototype.extractTables = function (html, options) {
8
+ var _a;
9
+ var tablesOpen = [];
10
+ var tableIndex = -1;
11
+ var htmlBefore = '';
12
+ var tablesData = [];
13
+ for (var i = 0; i < html.length; i++) {
14
+ var isTableStart = html.substring(i, i + 6).toLowerCase() === '<table';
15
+ var isTableEnd = html.substring(i - 7, i + 1).toLowerCase() === '</table>';
16
+ var parentTable = tablesOpen[tablesOpen.length - 1];
17
+ if (isTableStart) {
18
+ tableIndex++;
19
+ tablesOpen.push({
20
+ tableIndex: tableIndex,
21
+ parentTableIndex: (_a = parentTable === null || parentTable === void 0 ? void 0 : parentTable.tableIndex) !== null && _a !== void 0 ? _a : null,
22
+ childTableIndexes: [],
23
+ positionStart: i,
24
+ positionEnd: -1,
25
+ htmlBefore: htmlBefore,
26
+ html: '',
27
+ rows: [],
28
+ });
29
+ parentTable === null || parentTable === void 0 ? void 0 : parentTable.childTableIndexes.push(tableIndex);
30
+ htmlBefore = '';
31
+ }
32
+ if (tablesOpen.length === 0) {
33
+ htmlBefore += html[i];
34
+ }
35
+ for (var a = tablesOpen.length - 1; a >= 0; a--) {
36
+ tablesOpen[a].html += html[i];
37
+ }
38
+ if (isTableEnd && tablesOpen.length > 0) {
39
+ tablesOpen[tablesOpen.length - 1].positionEnd = i;
40
+ var tableData = tablesOpen.pop();
41
+ tablesData[tableData.tableIndex] = tableData;
42
+ }
43
+ }
44
+ this.addTableCells(tablesData);
45
+ this.addTableCellValues(tablesData, options);
46
+ this.addMissingNameCol(tablesData);
47
+ this.removeEmptyColumns(tablesData);
48
+ this.mergeHeaderRows(tablesData);
49
+ return tablesData.filter(Boolean);
50
+ };
51
+ HtmlTableExtractor.prototype.mergeHeaderRows = function (tables) {
52
+ var _a;
53
+ for (var _i = 0, tables_1 = tables; _i < tables_1.length; _i++) {
54
+ var table = tables_1[_i];
55
+ var bodyRowIndex = table.rows.findIndex(function (row) { return row.some(function (col) { return !col.isHeaderRowCell; }); });
56
+ var headerRowIndex = bodyRowIndex - 1;
57
+ var bodyRow = table.rows[bodyRowIndex];
58
+ var headerRow = table.rows[headerRowIndex];
59
+ if (!bodyRow || headerRowIndex < 0)
60
+ continue;
61
+ for (var i = 0; i < bodyRow.length; i++) {
62
+ var headerCol = headerRow[i];
63
+ var bodyCol = bodyRow[i];
64
+ headerCol.valueParsed = (_a = bodyCol.headerCol) !== null && _a !== void 0 ? _a : headerCol.valueParsed;
65
+ }
66
+ }
67
+ };
68
+ HtmlTableExtractor.prototype.removeEmptyColumns = function (tables) {
69
+ var _loop_1 = function (table) {
70
+ var emptyColumns = new Set();
71
+ var _loop_2 = function (c) {
72
+ var isAllEmpty = table.rows.every(function (row) { var _a, _b; return ((_a = row[c]) === null || _a === void 0 ? void 0 : _a.valueParsed) === null || ((_b = row[c]) === null || _b === void 0 ? void 0 : _b.isHeaderRowCell); });
73
+ if (isAllEmpty) {
74
+ emptyColumns.add(c);
75
+ }
76
+ };
77
+ for (var c = 0; c < table.rows[0].length; c++) {
78
+ _loop_2(c);
79
+ }
80
+ for (var r = 0; r < table.rows.length; r++) {
81
+ var row = table.rows[r];
82
+ table.rows[r] = row.filter(function (_, i) { return !emptyColumns.has(i); });
83
+ }
84
+ table.rows = table.rows.filter(function (row) { return row.some(function (col) { return col.valueParsed !== null; }); });
85
+ };
86
+ for (var _i = 0, tables_2 = tables; _i < tables_2.length; _i++) {
87
+ var table = tables_2[_i];
88
+ _loop_1(table);
89
+ }
90
+ return tables;
91
+ };
92
+ HtmlTableExtractor.prototype.addTableCells = function (tables) {
93
+ var tablesByIndex = new Map(tables.map(function (t) { return [t.tableIndex, t]; }));
94
+ tablesByIndex.forEach(function (table) {
95
+ var _a, _b;
96
+ var skipIndexMap = new Map(table.childTableIndexes.map(function (childIndex) {
97
+ var _a, _b;
98
+ var child = tablesByIndex.get(childIndex);
99
+ return [
100
+ ((_a = child === null || child === void 0 ? void 0 : child.positionStart) !== null && _a !== void 0 ? _a : 0) - table.positionStart,
101
+ ((_b = child === null || child === void 0 ? void 0 : child.positionEnd) !== null && _b !== void 0 ? _b : 0) - table.positionStart,
102
+ ];
103
+ }));
104
+ var grid = [];
105
+ var isInCell = false;
106
+ var isInCellAtts = false;
107
+ var cellAtts = '';
108
+ var cellHTML = '';
109
+ var rowIndex = -1;
110
+ var tableCellIndex = -1;
111
+ var createCell = function (html, atts) {
112
+ var _a, _b, _c, _d, _e, _f;
113
+ var attributePairs = atts
114
+ .toLowerCase()
115
+ .split(' ')
116
+ .map(function (att) { return att.split('='); });
117
+ var rowSpan = Number((_b = (_a = attributePairs.find(function (_a) {
118
+ var key = _a[0];
119
+ return key === 'rowspan';
120
+ })) === null || _a === void 0 ? void 0 : _a[1]) === null || _b === void 0 ? void 0 : _b.replace(/[^0-9]/g, '')) || 1;
121
+ var colSpan = Number((_d = (_c = attributePairs.find(function (_a) {
122
+ var key = _a[0];
123
+ return key === 'colspan';
124
+ })) === null || _c === void 0 ? void 0 : _c[1]) === null || _d === void 0 ? void 0 : _d.replace(/[^0-9]/g, '')) || 1;
125
+ var cell = {
126
+ attributes: atts.length > 4 ? atts.substring(4, atts.length - 1) : '',
127
+ html: html,
128
+ colSpan: colSpan,
129
+ rowSpan: rowSpan,
130
+ tableCellIndex: tableCellIndex,
131
+ rowIndex: rowIndex,
132
+ colIndex: -1,
133
+ isHeaderRowCell: false,
134
+ isBodyTitleRowCell: false,
135
+ valueParsed: null,
136
+ headerCol: null,
137
+ headerRowIndex: null,
138
+ };
139
+ // const hasCopies = cell.colSpan > 1 || cell.rowSpan > 1
140
+ var curRow = (_e = grid[rowIndex]) !== null && _e !== void 0 ? _e : [];
141
+ var nextEmptyCellIndex = curRow.findIndex(function (cell) { return !cell; });
142
+ var idxStart = nextEmptyCellIndex === -1 ? curRow.length : nextEmptyCellIndex;
143
+ for (var r = rowIndex; r < rowIndex + rowSpan; r++) {
144
+ grid[r] = (_f = grid[r]) !== null && _f !== void 0 ? _f : [];
145
+ for (var c = idxStart; c < idxStart + colSpan; c++) {
146
+ cell.colIndex = cell.colIndex > -1 ? cell.colIndex : c;
147
+ grid[r][c] = cell;
148
+ }
149
+ }
150
+ };
151
+ for (var i = 0; i < table.html.length; i++) {
152
+ var skipIndex = (_a = skipIndexMap.get(i)) !== null && _a !== void 0 ? _a : null;
153
+ if (skipIndex) {
154
+ cellHTML += table.html.substring(i, skipIndex + 1);
155
+ i = skipIndex;
156
+ continue;
157
+ }
158
+ var prev5Chars = table.html.substring(i - 4, i + 1).toLowerCase();
159
+ var next3Chars = table.html.substring(i, i + 3).toLowerCase();
160
+ var isCellAttsStart = ['<td', '<th'].includes(next3Chars);
161
+ var isSelfEnclosed = isInCellAtts && table.html[i - 1] === '/' && table.html[i] === '>';
162
+ var isCellAttsEnd = (isInCell && table.html[i] === '>') || isSelfEnclosed;
163
+ var isCellEnd = ['</td>', '</th>'].includes(prev5Chars);
164
+ var isRowStart = next3Chars === '<tr';
165
+ if (isRowStart) {
166
+ rowIndex++;
167
+ grid[rowIndex] = (_b = grid[rowIndex]) !== null && _b !== void 0 ? _b : [];
168
+ }
169
+ if (isCellAttsStart) {
170
+ tableCellIndex++;
171
+ isInCell = true;
172
+ isInCellAtts = true;
173
+ }
174
+ if (isInCellAtts) {
175
+ cellAtts += table.html[i];
176
+ }
177
+ if (isInCell) {
178
+ cellHTML += table.html[i];
179
+ }
180
+ if (isCellAttsEnd) {
181
+ isInCellAtts = false;
182
+ }
183
+ if (isCellEnd || isSelfEnclosed) {
184
+ isInCell = false;
185
+ isInCellAtts = false;
186
+ createCell(cellHTML, cellAtts);
187
+ cellHTML = '';
188
+ cellAtts = '';
189
+ }
190
+ }
191
+ table.rows = grid;
192
+ });
193
+ };
194
+ HtmlTableExtractor.prototype.addMissingNameCol = function (tables) {
195
+ var _a, _b;
196
+ var _loop_3 = function (table) {
197
+ var bodyIndex = table.rows.findIndex(function (row) { return row.some(function (col) { return !col.isHeaderRowCell; }); });
198
+ // get the first column index that has a value
199
+ var firstPopulatedColIndex = Infinity;
200
+ for (var i = bodyIndex; i < table.rows.length; i++) {
201
+ var row = table.rows[i];
202
+ var populatedIndex = row.findIndex(function (col) { return col.valueParsed; });
203
+ var isFirstPopulatedIndex = populatedIndex > -1 && populatedIndex < firstPopulatedColIndex;
204
+ if (isFirstPopulatedIndex)
205
+ firstPopulatedColIndex = populatedIndex;
206
+ if (firstPopulatedColIndex === 0)
207
+ break;
208
+ }
209
+ var shouldAddName = table.rows.some(function (row) {
210
+ var firstCol = row[firstPopulatedColIndex];
211
+ var headerCol = firstCol === null || firstCol === void 0 ? void 0 : firstCol.headerCol;
212
+ // skip if the first column has a header col, or if there is no header row
213
+ if (!firstCol || headerCol || firstCol.headerRowIndex === null) {
214
+ return false;
215
+ }
216
+ // if the first col is a string, assume it's a name
217
+ return typeof firstCol.valueParsed === 'string';
218
+ });
219
+ if (shouldAddName) {
220
+ for (var _c = 0, _d = table.rows; _c < _d.length; _c++) {
221
+ var row = _d[_c];
222
+ var col = row[firstPopulatedColIndex];
223
+ if (!col)
224
+ continue;
225
+ var isEmptyRow = row.every(function (col) { return col.valueParsed === null; });
226
+ // for header rows, add to valueParsed, body rows, set headerCol
227
+ if (!isEmptyRow && col.isHeaderRowCell) {
228
+ col.valueParsed = (_a = col.valueParsed) !== null && _a !== void 0 ? _a : '[name]';
229
+ }
230
+ else if (!col.isHeaderRowCell) {
231
+ col.headerCol = (_b = col.headerCol) !== null && _b !== void 0 ? _b : '[name]';
232
+ }
233
+ }
234
+ }
235
+ };
236
+ for (var _i = 0, tables_3 = tables; _i < tables_3.length; _i++) {
237
+ var table = tables_3[_i];
238
+ _loop_3(table);
239
+ }
240
+ };
241
+ HtmlTableExtractor.prototype.addTableCellValues = function (tables, options) {
242
+ var _this = this;
243
+ var _a;
244
+ var getHeaderRowIndex = function (rows) {
245
+ var bodyIndex = rows.findIndex(function (row, r) {
246
+ var _a;
247
+ var prevRow = (_a = rows[r - 1]) !== null && _a !== void 0 ? _a : [];
248
+ var hadUnderlines = prevRow.some(function (col) { return col.attributes.includes('border') && col.attributes.includes('bottom'); });
249
+ var hasUnderline = row.some(function (col) { return col.attributes.includes('border') && col.attributes.includes('bottom'); });
250
+ if (hadUnderlines && !hasUnderline) {
251
+ return true;
252
+ }
253
+ return row.some(function (col) {
254
+ var valueParsed = _this.parseValue(col.html, options);
255
+ var isNumber = typeof valueParsed === 'number';
256
+ var isYear = isNumber && valueParsed > 1900 && valueParsed < 2100;
257
+ var isCol = isNumber && !isYear;
258
+ return isCol;
259
+ });
260
+ });
261
+ return bodyIndex - 1;
262
+ };
263
+ var getNextCell = function (row, colIndex) {
264
+ var startingCol = row[colIndex];
265
+ for (var i = colIndex; i < row.length; i++) {
266
+ if (row[i].tableCellIndex !== (startingCol === null || startingCol === void 0 ? void 0 : startingCol.tableCellIndex)) {
267
+ return row[i];
268
+ }
269
+ }
270
+ return null;
271
+ };
272
+ var completedCells = new Set();
273
+ var _loop_4 = function (table) {
274
+ var headerRowIndex = getHeaderRowIndex(table.rows);
275
+ var _loop_5 = function (rowIndex) {
276
+ var row = table.rows[rowIndex];
277
+ var countUniqueCells = new Set(row.map(function (c) { return c.tableCellIndex; })).size;
278
+ // skip titles in the middle of the body
279
+ var isBodyTitleRow = rowIndex > headerRowIndex && countUniqueCells === 1 && ((_a = row[0]) === null || _a === void 0 ? void 0 : _a.colSpan) > 0;
280
+ var isHeaderRow = rowIndex <= headerRowIndex;
281
+ var headerByIndex = new Map();
282
+ var getHeaderCol = function (c) {
283
+ var _a, _b, _c;
284
+ if (headerByIndex.has(c)) {
285
+ return (_a = headerByIndex.get(c)) !== null && _a !== void 0 ? _a : null;
286
+ }
287
+ if (isHeaderRow) {
288
+ return null;
289
+ }
290
+ for (var r = 0; r <= headerRowIndex; r++) {
291
+ var row_1 = table.rows[r];
292
+ for (var c_1 = 0; c_1 < row_1.length; c_1++) {
293
+ var col = row_1[c_1];
294
+ var headerCurrent = (_b = headerByIndex.get(c_1)) !== null && _b !== void 0 ? _b : '';
295
+ var value = headerCurrent.endsWith("".concat(col.valueParsed || ''))
296
+ ? headerCurrent
297
+ : "".concat(headerCurrent, " ").concat(col.valueParsed || '').trim();
298
+ headerByIndex.set(c_1, value);
299
+ }
300
+ }
301
+ return (_c = headerByIndex.get(c)) !== null && _c !== void 0 ? _c : null;
302
+ };
303
+ for (var colIndex = 0; colIndex < row.length; colIndex++) {
304
+ var cell = row[colIndex];
305
+ if (completedCells.has(cell)) {
306
+ continue;
307
+ }
308
+ cell.headerRowIndex = headerRowIndex > -1 ? headerRowIndex : null;
309
+ cell.isBodyTitleRowCell = isBodyTitleRow;
310
+ cell.isHeaderRowCell = isHeaderRow;
311
+ // sometimes there is a rogue percent sign that is not in a column, so we need to check the next column
312
+ var nextCell = getNextCell(row, colIndex);
313
+ // const isMissingPercentSign =
314
+ // nextCell?.html.includes('%') && this.parseValue(nextCell?.html) === null
315
+ var isMissingParenthesis = (nextCell === null || nextCell === void 0 ? void 0 : nextCell.html.includes(')')) && cell.html.includes('(') && !cell.html.includes(')');
316
+ var colValue = isMissingParenthesis ? "".concat(cell.html.trim(), ")") : cell.html.trim();
317
+ // colValue = isMissingPercentSign ? `${colValue}` : colValue
318
+ colValue = this_1.parseValue(colValue, options);
319
+ colValue = typeof colValue === 'string' ? colValue.replace(/\s+/g, ' ') : colValue;
320
+ // add parsed value
321
+ cell.valueParsed = colValue;
322
+ cell.headerCol = getHeaderCol(colIndex);
323
+ completedCells.add(cell);
324
+ }
325
+ };
326
+ for (var rowIndex = 0; rowIndex < table.rows.length; rowIndex++) {
327
+ _loop_5(rowIndex);
328
+ }
329
+ };
330
+ var this_1 = this;
331
+ for (var _i = 0, tables_4 = tables; _i < tables_4.length; _i++) {
332
+ var table = tables_4[_i];
333
+ _loop_4(table);
334
+ }
335
+ };
336
+ HtmlTableExtractor.prototype.stripHtml = function (str, options) {
337
+ var _a = options !== null && options !== void 0 ? options : {}, _b = _a.tagsToExclude, tagsToExclude = _b === void 0 ? [] : _b, _c = _a.tagsToInclude, tagsToInclude = _c === void 0 ? [] : _c;
338
+ var strNew = '';
339
+ var iterator = new XMLParser_1.default();
340
+ var isAllowedPath = true;
341
+ var updateAllowedPath = function (data) {
342
+ var openTags = data.path.split('.');
343
+ if (tagsToInclude.length > 0) {
344
+ isAllowedPath = tagsToInclude.some(function (t) { return openTags.includes(t); });
345
+ }
346
+ else if (tagsToExclude.length > 0) {
347
+ isAllowedPath = !tagsToExclude.some(function (t) { return openTags.includes(t); });
348
+ }
349
+ };
350
+ iterator.iterateXML({
351
+ xml: str,
352
+ onOpenTag: function (data) {
353
+ updateAllowedPath(data);
354
+ },
355
+ onCloseTag: function (data) {
356
+ strNew += isAllowedPath && !strNew.endsWith('\n') ? '\n' : '';
357
+ updateAllowedPath(data);
358
+ },
359
+ onCharacter: function (data) {
360
+ if (isAllowedPath) {
361
+ strNew += data.char;
362
+ }
363
+ },
364
+ });
365
+ return strNew;
366
+ };
367
+ HtmlTableExtractor.prototype.parseValue = function (str, options) {
368
+ var _a;
369
+ if (str === null)
370
+ return null;
371
+ if (typeof str === 'number')
372
+ return str;
373
+ var _b = options !== null && options !== void 0 ? options : {}, _c = _b.stripHtml, stripHtml = _c === void 0 ? true : _c, _d = _b.tagsToExclude, tagsToExclude = _d === void 0 ? [] : _d, _e = _b.tagsToInclude, tagsToInclude = _e === void 0 ? [] : _e, _f = _b.stripParenthesis, stripParenthesis = _f === void 0 ? false : _f;
374
+ var strNew = stripHtml ? this.stripHtml(str, { tagsToExclude: tagsToExclude, tagsToInclude: tagsToInclude }) : str;
375
+ var text = strNew
376
+ .replace(/&#160;|&nbsp;|\n/g, ' ')
377
+ .replace(/&#174;|&#9744;/g, '')
378
+ .replace(/&#8211;|&#8212;|&#x2014;|&#151;/g, '-')
379
+ .replace(/&#8217;|&#8220;|&#8221;|&rsquo;/g, "'");
380
+ if (stripParenthesis) {
381
+ text = text.replace(/\(.*?\)/g, '');
382
+ }
383
+ text = text
384
+ .replace(/\s+/, ' ')
385
+ .replace(/&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-fA-F]{1,6});/g, ' ')
386
+ .trim();
387
+ if (str.replace(/&#8211;|&#8212;|&#x2014;/g, '-') === '-')
388
+ return '-';
389
+ if (text === '')
390
+ return null;
391
+ var colNum = text.replace(/,|\(|\)|\%/g, '').trim();
392
+ if (colNum === '-' || colNum === '$')
393
+ return null;
394
+ colNum = colNum.replace(/\-|\$/g, '');
395
+ var hasNumBeforeParenthesis = Boolean(/\d+\s*(?=\()/.test(text));
396
+ colNum = hasNumBeforeParenthesis ? (_a = colNum.split(' ')[0]) === null || _a === void 0 ? void 0 : _a.trim() : colNum;
397
+ if (!isNaN(Number(colNum))) {
398
+ if (text.includes('%'))
399
+ return text.replace(/[^a-zA-Z\d\s:]/g, '') === '' ? null : text;
400
+ return (text.trim().includes('(') && !hasNumBeforeParenthesis) || text.includes('-')
401
+ ? Number(colNum) * -1
402
+ : Number(colNum);
403
+ }
404
+ return text;
405
+ };
406
+ return HtmlTableExtractor;
407
+ }());
408
+ exports.default = HtmlTableExtractor;
@@ -1,4 +1,3 @@
1
- import { DocumentNode } from './XMLNode/DocumentNode';
2
1
  interface OnCharacterData {
3
2
  char: string;
4
3
  index: number;
@@ -6,9 +5,7 @@ interface OnCharacterData {
6
5
  pathOccurrenceCount: number;
7
6
  attributesStr: string;
8
7
  }
9
- interface ParseTableNodesParams {
10
- xml: string;
11
- }
8
+ type ObjOrArr = Record<string | symbol | number, any> | any[];
12
9
  interface Parse2Params {
13
10
  xml: string;
14
11
  onCharacter?: (data: OnCharacterData) => void;
@@ -29,12 +26,30 @@ interface IterateTablesParams {
29
26
  textMap: Map<string, string>;
30
27
  }) => void;
31
28
  }
29
+ interface Parser {
30
+ parse(xml: string): Record<string, any>;
31
+ }
32
+ interface XMLParserArgs {
33
+ parser: Parser;
34
+ }
35
+ interface Section {
36
+ text: string;
37
+ tables: string[][][][];
38
+ }
32
39
  export default class XMLParser {
40
+ private readonly parser;
41
+ constructor(args?: XMLParserArgs);
42
+ extractJsonKey(json: Record<string, any>, extractKey: string): Record<string, any>;
43
+ parse(xml: string): any;
44
+ /**
45
+ * Return true in the callback if you want the loop to stop
46
+ */
47
+ iterateKeysDeep(obj: ObjOrArr, cb: (key: string, currentObj: ObjOrArr, originalObj: ObjOrArr) => void | boolean): void;
48
+ parseSectionTables(xml: string): Section[];
33
49
  iterateXML(params: Parse2Params): string[];
34
50
  /**
35
51
  * Returns text in each table cell mapped by `${table}.${row}.${col}`
36
52
  */
37
53
  getTableTextMap(params: IterateTablesParams): Map<string, string>;
38
- getDocumentNode(params: ParseTableNodesParams): DocumentNode;
39
54
  }
40
55
  export {};