@nahisaho/katashiro-collector 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. package/dist/browser/ActionExecutor.d.ts +85 -0
  2. package/dist/browser/ActionExecutor.d.ts.map +1 -0
  3. package/dist/browser/ActionExecutor.js +171 -0
  4. package/dist/browser/ActionExecutor.js.map +1 -0
  5. package/dist/browser/BrowserAutomation.d.ts +147 -0
  6. package/dist/browser/BrowserAutomation.d.ts.map +1 -0
  7. package/dist/browser/BrowserAutomation.js +463 -0
  8. package/dist/browser/BrowserAutomation.js.map +1 -0
  9. package/dist/browser/ContentExtractor.d.ts +54 -0
  10. package/dist/browser/ContentExtractor.d.ts.map +1 -0
  11. package/dist/browser/ContentExtractor.js +159 -0
  12. package/dist/browser/ContentExtractor.js.map +1 -0
  13. package/dist/browser/SessionManager.d.ts +67 -0
  14. package/dist/browser/SessionManager.d.ts.map +1 -0
  15. package/dist/browser/SessionManager.js +173 -0
  16. package/dist/browser/SessionManager.js.map +1 -0
  17. package/dist/browser/index.d.ts +17 -0
  18. package/dist/browser/index.d.ts.map +1 -0
  19. package/dist/browser/index.js +17 -0
  20. package/dist/browser/index.js.map +1 -0
  21. package/dist/browser/types.d.ts +361 -0
  22. package/dist/browser/types.d.ts.map +1 -0
  23. package/dist/browser/types.js +23 -0
  24. package/dist/browser/types.js.map +1 -0
  25. package/dist/document/DocumentParser.d.ts +91 -0
  26. package/dist/document/DocumentParser.d.ts.map +1 -0
  27. package/dist/document/DocumentParser.js +234 -0
  28. package/dist/document/DocumentParser.js.map +1 -0
  29. package/dist/document/index.d.ts +11 -0
  30. package/dist/document/index.d.ts.map +1 -0
  31. package/dist/document/index.js +10 -0
  32. package/dist/document/index.js.map +1 -0
  33. package/dist/document/parsers/DOCXParser.d.ts +63 -0
  34. package/dist/document/parsers/DOCXParser.d.ts.map +1 -0
  35. package/dist/document/parsers/DOCXParser.js +362 -0
  36. package/dist/document/parsers/DOCXParser.js.map +1 -0
  37. package/dist/document/parsers/PDFParser.d.ts +60 -0
  38. package/dist/document/parsers/PDFParser.d.ts.map +1 -0
  39. package/dist/document/parsers/PDFParser.js +338 -0
  40. package/dist/document/parsers/PDFParser.js.map +1 -0
  41. package/dist/document/parsers/XLSXParser.d.ts +55 -0
  42. package/dist/document/parsers/XLSXParser.d.ts.map +1 -0
  43. package/dist/document/parsers/XLSXParser.js +314 -0
  44. package/dist/document/parsers/XLSXParser.js.map +1 -0
  45. package/dist/document/parsers/index.d.ts +10 -0
  46. package/dist/document/parsers/index.d.ts.map +1 -0
  47. package/dist/document/parsers/index.js +10 -0
  48. package/dist/document/parsers/index.js.map +1 -0
  49. package/dist/document/types.d.ts +251 -0
  50. package/dist/document/types.d.ts.map +1 -0
  51. package/dist/document/types.js +13 -0
  52. package/dist/document/types.js.map +1 -0
  53. package/dist/index.d.ts +7 -2
  54. package/dist/index.d.ts.map +1 -1
  55. package/dist/index.js +14 -2
  56. package/dist/index.js.map +1 -1
  57. package/dist/research/CoverageAnalyzer.d.ts +50 -0
  58. package/dist/research/CoverageAnalyzer.d.ts.map +1 -0
  59. package/dist/research/CoverageAnalyzer.js +169 -0
  60. package/dist/research/CoverageAnalyzer.js.map +1 -0
  61. package/dist/research/QueryPlanner.d.ts +57 -0
  62. package/dist/research/QueryPlanner.d.ts.map +1 -0
  63. package/dist/research/QueryPlanner.js +102 -0
  64. package/dist/research/QueryPlanner.js.map +1 -0
  65. package/dist/research/ResultAggregator.d.ts +39 -0
  66. package/dist/research/ResultAggregator.d.ts.map +1 -0
  67. package/dist/research/ResultAggregator.js +85 -0
  68. package/dist/research/ResultAggregator.js.map +1 -0
  69. package/dist/research/WideResearchEngine.d.ts +110 -0
  70. package/dist/research/WideResearchEngine.d.ts.map +1 -0
  71. package/dist/research/WideResearchEngine.js +330 -0
  72. package/dist/research/WideResearchEngine.js.map +1 -0
  73. package/dist/research/agents/AcademicSearchAgent.d.ts +57 -0
  74. package/dist/research/agents/AcademicSearchAgent.d.ts.map +1 -0
  75. package/dist/research/agents/AcademicSearchAgent.js +180 -0
  76. package/dist/research/agents/AcademicSearchAgent.js.map +1 -0
  77. package/dist/research/agents/EncyclopediaAgent.d.ts +49 -0
  78. package/dist/research/agents/EncyclopediaAgent.d.ts.map +1 -0
  79. package/dist/research/agents/EncyclopediaAgent.js +153 -0
  80. package/dist/research/agents/EncyclopediaAgent.js.map +1 -0
  81. package/dist/research/agents/NewsSearchAgent.d.ts +38 -0
  82. package/dist/research/agents/NewsSearchAgent.d.ts.map +1 -0
  83. package/dist/research/agents/NewsSearchAgent.js +146 -0
  84. package/dist/research/agents/NewsSearchAgent.js.map +1 -0
  85. package/dist/research/agents/WebSearchAgent.d.ts +45 -0
  86. package/dist/research/agents/WebSearchAgent.d.ts.map +1 -0
  87. package/dist/research/agents/WebSearchAgent.js +135 -0
  88. package/dist/research/agents/WebSearchAgent.js.map +1 -0
  89. package/dist/research/agents/index.d.ts +13 -0
  90. package/dist/research/agents/index.d.ts.map +1 -0
  91. package/dist/research/agents/index.js +12 -0
  92. package/dist/research/agents/index.js.map +1 -0
  93. package/dist/research/agents/types.d.ts +60 -0
  94. package/dist/research/agents/types.d.ts.map +1 -0
  95. package/dist/research/agents/types.js +9 -0
  96. package/dist/research/agents/types.js.map +1 -0
  97. package/dist/research/index.d.ts +16 -0
  98. package/dist/research/index.d.ts.map +1 -0
  99. package/dist/research/index.js +17 -0
  100. package/dist/research/index.js.map +1 -0
  101. package/dist/research/types.d.ts +206 -0
  102. package/dist/research/types.d.ts.map +1 -0
  103. package/dist/research/types.js +33 -0
  104. package/dist/research/types.js.map +1 -0
  105. package/package.json +1 -1
@@ -0,0 +1,234 @@
1
+ /**
2
+ * Document parser
3
+ *
4
+ * @design DES-COLLECT-003 §2.3
5
+ * @task TASK-001-2
6
+ */
7
+ import * as fs from 'fs/promises';
8
+ import * as path from 'path';
9
+ import { err } from '@nahisaho/katashiro-core';
10
+ import { PDFParser } from './parsers/PDFParser.js';
11
+ import { DOCXParser } from './parsers/DOCXParser.js';
12
+ import { XLSXParser } from './parsers/XLSXParser.js';
13
+ import { DEFAULT_PARSE_OPTIONS } from './types.js';
14
+ /**
15
+ * ドキュメントパーサーのファサード
16
+ *
17
+ * PDF、Word(DOCX)、Excel(XLSX)ファイルからテキストと構造を抽出します。
18
+ *
19
+ * @example
20
+ * ```typescript
21
+ * import { DocumentParser, isOk } from '@nahisaho/katashiro-collector';
22
+ *
23
+ * const parser = new DocumentParser();
24
+ *
25
+ * // PDFを解析
26
+ * const result = await parser.parse('./document.pdf');
27
+ * if (isOk(result)) {
28
+ * console.log(result.value.content);
29
+ * console.log(result.value.structure.headings);
30
+ * }
31
+ *
32
+ * // Excelを解析(特定シートのみ)
33
+ * const excelResult = await parser.parse('./data.xlsx', {
34
+ * sheetNames: ['Sheet1', 'Summary'],
35
+ * extractTables: true,
36
+ * });
37
+ *
38
+ * // バッファから解析
39
+ * const buffer = await fs.readFile('./document.pdf');
40
+ * const bufferResult = await parser.parseBuffer(buffer, 'document.pdf');
41
+ * ```
42
+ */
43
+ export class DocumentParser {
44
+ parsers;
45
+ constructor() {
46
+ this.parsers = new Map();
47
+ this.registerDefaultParsers();
48
+ }
49
+ /**
50
+ * デフォルトパーサーを登録
51
+ */
52
+ registerDefaultParsers() {
53
+ const pdfParser = new PDFParser();
54
+ const docxParser = new DOCXParser();
55
+ const xlsxParser = new XLSXParser();
56
+ // 拡張子でマッピング
57
+ this.parsers.set('.pdf', pdfParser);
58
+ this.parsers.set('.docx', docxParser);
59
+ this.parsers.set('.xlsx', xlsxParser);
60
+ }
61
+ /**
62
+ * カスタムパーサーを登録
63
+ *
64
+ * @param extension - ファイル拡張子(例: '.pptx')
65
+ * @param parser - パーサー実装
66
+ */
67
+ registerParser(extension, parser) {
68
+ this.parsers.set(extension.toLowerCase(), parser);
69
+ }
70
+ /**
71
+ * ファイルパスからドキュメントを解析
72
+ *
73
+ * @param filePath - ファイルの絶対または相対パス
74
+ * @param options - パースオプション
75
+ * @returns 解析結果またはエラー
76
+ */
77
+ async parse(filePath, options = {}) {
78
+ const mergedOptions = { ...DEFAULT_PARSE_OPTIONS, ...options };
79
+ // ファイル存在確認
80
+ try {
81
+ await fs.access(filePath);
82
+ }
83
+ catch {
84
+ return err({
85
+ code: 'FILE_NOT_FOUND',
86
+ message: `File not found: ${filePath}`,
87
+ filePath,
88
+ });
89
+ }
90
+ // ファイルサイズ確認
91
+ let stats;
92
+ try {
93
+ stats = await fs.stat(filePath);
94
+ }
95
+ catch (error) {
96
+ return err({
97
+ code: 'PERMISSION_DENIED',
98
+ message: `Cannot access file: ${filePath}`,
99
+ filePath,
100
+ details: error,
101
+ });
102
+ }
103
+ if (stats.size > mergedOptions.maxFileSize) {
104
+ return err({
105
+ code: 'FILE_TOO_LARGE',
106
+ message: `File size ${stats.size} exceeds maximum ${mergedOptions.maxFileSize}`,
107
+ filePath,
108
+ });
109
+ }
110
+ // パーサー選択
111
+ const ext = path.extname(filePath).toLowerCase();
112
+ const parser = this.parsers.get(ext);
113
+ if (!parser) {
114
+ return err({
115
+ code: 'UNSUPPORTED_FORMAT',
116
+ message: `Unsupported file format: ${ext}`,
117
+ filePath,
118
+ });
119
+ }
120
+ // タイムアウト付きで実行
121
+ return this.withTimeout(parser.parse(filePath, mergedOptions), mergedOptions.timeout, filePath);
122
+ }
123
+ /**
124
+ * バッファからドキュメントを解析
125
+ *
126
+ * @param buffer - ファイルのバイナリデータ
127
+ * @param filename - ファイル名(MIME タイプ判定用)
128
+ * @param options - パースオプション
129
+ * @returns 解析結果またはエラー
130
+ */
131
+ async parseBuffer(buffer, filename, options = {}) {
132
+ const mergedOptions = { ...DEFAULT_PARSE_OPTIONS, ...options };
133
+ if (buffer.length > mergedOptions.maxFileSize) {
134
+ return err({
135
+ code: 'FILE_TOO_LARGE',
136
+ message: `Buffer size ${buffer.length} exceeds maximum ${mergedOptions.maxFileSize}`,
137
+ });
138
+ }
139
+ const ext = path.extname(filename).toLowerCase();
140
+ const parser = this.parsers.get(ext);
141
+ if (!parser) {
142
+ return err({
143
+ code: 'UNSUPPORTED_FORMAT',
144
+ message: `Unsupported file format: ${ext}`,
145
+ });
146
+ }
147
+ return this.withTimeout(parser.parseBuffer(buffer, filename, mergedOptions), mergedOptions.timeout, filename);
148
+ }
149
+ /**
150
+ * ストリームからドキュメントを解析
151
+ *
152
+ * @param stream - 読み取り可能ストリーム
153
+ * @param filename - ファイル名
154
+ * @param options - パースオプション
155
+ * @returns 解析結果またはエラー
156
+ */
157
+ async parseStream(stream, filename, options = {}) {
158
+ const mergedOptions = { ...DEFAULT_PARSE_OPTIONS, ...options };
159
+ const chunks = [];
160
+ let totalSize = 0;
161
+ let resolved = false;
162
+ return new Promise((resolve) => {
163
+ stream.on('data', (chunk) => {
164
+ if (resolved)
165
+ return;
166
+ totalSize += chunk.length;
167
+ if (totalSize > mergedOptions.maxFileSize) {
168
+ resolved = true;
169
+ // @ts-expect-error: destroy may not exist on all ReadableStream types
170
+ if (typeof stream.destroy === 'function')
171
+ stream.destroy();
172
+ resolve(err({
173
+ code: 'FILE_TOO_LARGE',
174
+ message: `Stream size exceeds maximum ${mergedOptions.maxFileSize}`,
175
+ }));
176
+ return;
177
+ }
178
+ chunks.push(Buffer.from(chunk));
179
+ });
180
+ stream.on('error', (error) => {
181
+ resolve(err({
182
+ code: 'PARSE_ERROR',
183
+ message: `Stream read error: ${error.message}`,
184
+ }));
185
+ });
186
+ stream.on('end', async () => {
187
+ const buffer = Buffer.concat(chunks);
188
+ const result = await this.parseBuffer(buffer, filename, options);
189
+ resolve(result);
190
+ });
191
+ });
192
+ }
193
+ /**
194
+ * サポートするファイル形式を取得
195
+ */
196
+ getSupportedFormats() {
197
+ return [
198
+ { extension: '.pdf', mimeType: 'application/pdf', description: 'PDF Document' },
199
+ {
200
+ extension: '.docx',
201
+ mimeType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
202
+ description: 'Word Document',
203
+ },
204
+ {
205
+ extension: '.xlsx',
206
+ mimeType: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
207
+ description: 'Excel Spreadsheet',
208
+ },
209
+ ];
210
+ }
211
+ /**
212
+ * ファイルがサポートされているか確認
213
+ */
214
+ isSupported(filename) {
215
+ const ext = path.extname(filename).toLowerCase();
216
+ return this.parsers.has(ext);
217
+ }
218
+ /**
219
+ * タイムアウト付きで実行
220
+ */
221
+ async withTimeout(promise, timeout, filePath) {
222
+ const timeoutPromise = new Promise((resolve) => {
223
+ setTimeout(() => {
224
+ resolve(err({
225
+ code: 'TIMEOUT',
226
+ message: `Operation timed out after ${timeout}ms`,
227
+ filePath,
228
+ }));
229
+ }, timeout);
230
+ });
231
+ return Promise.race([promise, timeoutPromise]);
232
+ }
233
+ }
234
+ //# sourceMappingURL=DocumentParser.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"DocumentParser.js","sourceRoot":"","sources":["../../src/document/DocumentParser.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,MAAM,aAAa,CAAC;AAClC,OAAO,KAAK,IAAI,MAAM,MAAM,CAAC;AAC7B,OAAO,EAAE,GAAG,EAAe,MAAM,0BAA0B,CAAC;AAC5D,OAAO,EAAE,SAAS,EAAE,MAAM,wBAAwB,CAAC;AACnD,OAAO,EAAE,UAAU,EAAE,MAAM,yBAAyB,CAAC;AACrD,OAAO,EAAE,UAAU,EAAE,MAAM,yBAAyB,CAAC;AAQrD,OAAO,EAAE,qBAAqB,EAAE,MAAM,YAAY,CAAC;AAEnD;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4BG;AACH,MAAM,OAAO,cAAc;IACjB,OAAO,CAA+B;IAE9C;QACE,IAAI,CAAC,OAAO,GAAG,IAAI,GAAG,EAAE,CAAC;QACzB,IAAI,CAAC,sBAAsB,EAAE,CAAC;IAChC,CAAC;IAED;;OAEG;IACK,sBAAsB;QAC5B,MAAM,SAAS,GAAG,IAAI,SAAS,EAAE,CAAC;QAClC,MAAM,UAAU,GAAG,IAAI,UAAU,EAAE,CAAC;QACpC,MAAM,UAAU,GAAG,IAAI,UAAU,EAAE,CAAC;QAEpC,YAAY;QACZ,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;QACpC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,OAAO,EAAE,UAAU,CAAC,CAAC;QACtC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,OAAO,EAAE,UAAU,CAAC,CAAC;IACxC,CAAC;IAED;;;;;OAKG;IACH,cAAc,CAAC,SAAiB,EAAE,MAAuB;QACvD,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,SAAS,CAAC,WAAW,EAAE,EAAE,MAAM,CAAC,CAAC;IACpD,CAAC;IAED;;;;;;OAMG;IACH,KAAK,CAAC,KAAK,CACT,QAAgB,EAChB,UAAwB,EAAE;QAE1B,MAAM,aAAa,GAAG,EAAE,GAAG,qBAAqB,EAAE,GAAG,OAAO,EAAE,CAAC;QAE/D,WAAW;QACX,IAAI,CAAC;YACH,MAAM,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;QAC5B,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,GAAG,CAAC;gBACT,IAAI,EAAE,gBAAgB;gBACtB,OAAO,EAAE,mBAAmB,QAAQ,EAAE;gBACtC,QAAQ;aACT,CAAC,CAAC;QACL,CAAC;QAED,YAAY;QACZ,IAAI,KAAK,CAAC;QACV,IAAI,CAAC;YACH,KAAK,GAAG,MAAM,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAClC,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,GAAG,CAAC;gBACT,IAAI,EAAE,mBAAmB;gBACzB,OAAO,EAAE,uBAAuB,QAAQ,EAAE;gBAC1C,QAAQ;gBACR,OAAO,EAAE,KAAK;aACf,CAAC,CAAC;QACL,CAAC;QAED,IAAI,KAAK,CAAC,IAAI,GAAG,aAAa,CAAC,WAAW,EAAE,CAAC;YAC3C,OAAO,GAAG,CAAC;gBACT,IAAI,EAAE,gBAAgB;gBACtB,OAAO,EAAE,aAAa,KAAK,CAAC,IAAI,oBAAoB,aAAa,CAAC,WAAW,EAAE;gBAC/E,QAAQ;aACT,CAAC,CAAC;QACL,CAAC;QAED,SAAS;QACT,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;QACjD,MAAM,MAAM,GAAG,IAAI,
CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAErC,IAAI,CAAC,MAAM,EAAE,CAAC;YACZ,OAAO,GAAG,CAAC;gBACT,IAAI,EAAE,oBAAoB;gBAC1B,OAAO,EAAE,4BAA4B,GAAG,EAAE;gBAC1C,QAAQ;aACT,CAAC,CAAC;QACL,CAAC;QAED,cAAc;QACd,OAAO,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,KAAK,CAAC,QAAQ,EAAE,aAAa,CAAC,EAAE,aAAa,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC;IAClG,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,WAAW,CACf,MAAc,EACd,QAAgB,EAChB,UAAwB,EAAE;QAE1B,MAAM,aAAa,GAAG,EAAE,GAAG,qBAAqB,EAAE,GAAG,OAAO,EAAE,CAAC;QAE/D,IAAI,MAAM,CAAC,MAAM,GAAG,aAAa,CAAC,WAAW,EAAE,CAAC;YAC9C,OAAO,GAAG,CAAC;gBACT,IAAI,EAAE,gBAAgB;gBACtB,OAAO,EAAE,eAAe,MAAM,CAAC,MAAM,oBAAoB,aAAa,CAAC,WAAW,EAAE;aACrF,CAAC,CAAC;QACL,CAAC;QAED,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;QACjD,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAErC,IAAI,CAAC,MAAM,EAAE,CAAC;YACZ,OAAO,GAAG,CAAC;gBACT,IAAI,EAAE,oBAAoB;gBAC1B,OAAO,EAAE,4BAA4B,GAAG,EAAE;aAC3C,CAAC,CAAC;QACL,CAAC;QAED,OAAO,IAAI,CAAC,WAAW,CACrB,MAAM,CAAC,WAAW,CAAC,MAAM,EAAE,QAAQ,EAAE,aAAa,CAAC,EACnD,aAAa,CAAC,OAAO,EACrB,QAAQ,CACT,CAAC;IACJ,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,WAAW,CACf,MAA6B,EAC7B,QAAgB,EAChB,UAAwB,EAAE;QAE1B,MAAM,aAAa,GAAG,EAAE,GAAG,qBAAqB,EAAE,GAAG,OAAO,EAAE,CAAC;QAC/D,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,IAAI,QAAQ,GAAG,KAAK,CAAC;QAErB,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;YAC7B,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,KAAa,EAAE,EAAE;gBAClC,IAAI,QAAQ;oBAAE,OAAO;gBACrB,SAAS,IAAI,KAAK,CAAC,MAAM,CAAC;gBAC1B,IAAI,SAAS,GAAG,aAAa,CAAC,WAAW,EAAE,CAAC;oBAC1C,QAAQ,GAAG,IAAI,CAAC;oBAChB,sEAAsE;oBACtE,IAAI,OAAO,MAAM,CAAC,OAAO,KAAK,UAAU;wBAAE,MAAM,CAAC,OAAO,EAAE,CAAC;oBAC3D,OAAO,CACL,GAAG,CAAC;wBACF,IAAI,EAAE,gBAAgB;wBACtB,OAAO,EAAE,+BAA+B,aAAa,CAAC,WAAW,EAAE;qBACpE,CAAC,CACH,CAAC;oBACF,OAAO;gBACT,CAAC;gBACD,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;YAClC,CAAC,CAAC,CAAC;YAEH,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,EAAE;gBAC3B,OAAO,CACL,GAAG,CAAC;oBACF,IAAI,EAAE,aAAa;oBACnB,OAAO,EAAE,sBAAsB,KAAK,CAAC,OAAO,EAAE;iBAC/C,CAAC,CACH,CAAC;
YACJ,CAAC,CAAC,CAAC;YAEH,MAAM,CAAC,EAAE,CAAC,KAAK,EAAE,KAAK,IAAI,EAAE;gBAC1B,MAAM,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;gBACrC,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC;gBACjE,OAAO,CAAC,MAAM,CAAC,CAAC;YAClB,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,mBAAmB;QACjB,OAAO;YACL,EAAE,SAAS,EAAE,MAAM,EAAE,QAAQ,EAAE,iBAAiB,EAAE,WAAW,EAAE,cAAc,EAAE;YAC/E;gBACE,SAAS,EAAE,OAAO;gBAClB,QAAQ,EAAE,yEAAyE;gBACnF,WAAW,EAAE,eAAe;aAC7B;YACD;gBACE,SAAS,EAAE,OAAO;gBAClB,QAAQ,EAAE,mEAAmE;gBAC7E,WAAW,EAAE,mBAAmB;aACjC;SACF,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,WAAW,CAAC,QAAgB;QAC1B,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;QACjD,OAAO,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IAC/B,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,WAAW,CACvB,OAA0C,EAC1C,OAAe,EACf,QAAiB;QAEjB,MAAM,cAAc,GAAG,IAAI,OAAO,CAA2B,CAAC,OAAO,EAAE,EAAE;YACvE,UAAU,CAAC,GAAG,EAAE;gBACd,OAAO,CACL,GAAG,CAAC;oBACF,IAAI,EAAE,SAAS;oBACf,OAAO,EAAE,6BAA6B,OAAO,IAAI;oBACjD,QAAQ;iBACT,CAAC,CACH,CAAC;YACJ,CAAC,EAAE,OAAO,CAAC,CAAC;QACd,CAAC,CAAC,CAAC;QAEH,OAAO,OAAO,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,cAAc,CAAC,CAAC,CAAC;IACjD,CAAC;CACF"}
@@ -0,0 +1,11 @@
1
+ /**
2
+ * Document module
3
+ *
4
+ * @design DES-COLLECT-003
5
+ * @task TASK-001
6
+ */
7
+ export { DocumentParser } from './DocumentParser.js';
8
+ export { PDFParser, DOCXParser, XLSXParser } from './parsers/index.js';
9
+ export type { ParsedDocument, DocumentError, DocumentErrorCode, ParseOptions, SupportedFormat, IDocumentParser, DocumentStructure, DocumentMetadata, Heading, Paragraph, ParagraphStyle, Section, TableOfContents, TocEntry, TableData, TableRow, TableCell, ImageReference, PageInfo, SheetInfo, } from './types.js';
10
+ export { DEFAULT_PARSE_OPTIONS } from './types.js';
11
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/document/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AACrD,OAAO,EAAE,SAAS,EAAE,UAAU,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAC;AACvE,YAAY,EAEV,cAAc,EACd,aAAa,EACb,iBAAiB,EACjB,YAAY,EACZ,eAAe,EACf,eAAe,EAEf,iBAAiB,EACjB,gBAAgB,EAChB,OAAO,EACP,SAAS,EACT,cAAc,EACd,OAAO,EACP,eAAe,EACf,QAAQ,EAER,SAAS,EACT,QAAQ,EACR,SAAS,EACT,cAAc,EACd,QAAQ,EACR,SAAS,GACV,MAAM,YAAY,CAAC;AACpB,OAAO,EAAE,qBAAqB,EAAE,MAAM,YAAY,CAAC"}
@@ -0,0 +1,10 @@
1
+ /**
2
+ * Document module
3
+ *
4
+ * @design DES-COLLECT-003
5
+ * @task TASK-001
6
+ */
7
+ export { DocumentParser } from './DocumentParser.js';
8
+ export { PDFParser, DOCXParser, XLSXParser } from './parsers/index.js';
9
+ export { DEFAULT_PARSE_OPTIONS } from './types.js';
10
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/document/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AACrD,OAAO,EAAE,SAAS,EAAE,UAAU,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAC;AA0BvE,OAAO,EAAE,qBAAqB,EAAE,MAAM,YAAY,CAAC"}
@@ -0,0 +1,63 @@
1
+ /**
2
+ * DOCX parser
3
+ *
4
+ * @design DES-COLLECT-003 §2.3
5
+ * @task TASK-001-4
6
+ */
7
+ import { type Result } from '@nahisaho/katashiro-core';
8
+ import type { IDocumentParser, ParsedDocument, DocumentError, ParseOptions, SupportedFormat } from '../types.js';
9
+ /**
10
+ * DOCX parser implementation of the IDocumentParser interface.
11
+ *
12
+ * @example
13
+ * ```typescript
14
+ * const parser = new DOCXParser();
15
+ * const result = await parser.parse('./document.docx');
16
+ * ```
17
+ */
18
+ export declare class DOCXParser implements IDocumentParser {
19
+ private mammoth;
20
+ private loadMammoth;
21
+ parse(filePath: string, options?: ParseOptions): Promise<Result<ParsedDocument, DocumentError>>;
22
+ parseBuffer(buffer: Buffer, filename: string, options?: ParseOptions): Promise<Result<ParsedDocument, DocumentError>>;
23
+ parseStream(stream: NodeJS.ReadableStream, filename: string, options?: ParseOptions): Promise<Result<ParsedDocument, DocumentError>>;
24
+ getSupportedFormats(): SupportedFormat[];
25
+ isSupported(filename: string): boolean;
26
+ /**
27
+ * Extract the document structure.
28
+ */
29
+ private extractStructure;
30
+ /**
31
+ * Extract headings from HTML.
32
+ */
33
+ private extractHeadings;
34
+ /**
35
+ * Extract paragraphs.
36
+ */
37
+ private extractParagraphs;
38
+ /**
39
+ * Build sections.
40
+ */
41
+ private buildSections;
42
+ /**
43
+ * Extract tables from HTML.
44
+ */
45
+ private extractTables;
46
+ /**
47
+ * Extract image references from HTML.
48
+ */
49
+ private extractImages;
50
+ /**
51
+ * Strip HTML tags.
52
+ */
53
+ private stripHtml;
54
+ /**
55
+ * Extract metadata.
56
+ */
57
+ private extractMetadata;
58
+ /**
59
+ * Guess the MIME type from the file name.
60
+ */
61
+ private guessMimeType;
62
+ }
63
+ //# sourceMappingURL=DOCXParser.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"DOCXParser.d.ts","sourceRoot":"","sources":["../../../src/document/parsers/DOCXParser.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAW,KAAK,MAAM,EAAE,MAAM,0BAA0B,CAAC;AAChE,OAAO,KAAK,EACV,eAAe,EACf,cAAc,EACd,aAAa,EACb,YAAY,EACZ,eAAe,EAQhB,MAAM,aAAa,CAAC;AAmBrB;;;;;;;;GAQG;AACH,qBAAa,UAAW,YAAW,eAAe;IAChD,OAAO,CAAC,OAAO,CAA8B;YAE/B,WAAW;IAYnB,KAAK,CACT,QAAQ,EAAE,MAAM,EAChB,OAAO,GAAE,YAAiB,GACzB,OAAO,CAAC,MAAM,CAAC,cAAc,EAAE,aAAa,CAAC,CAAC;IAgC3C,WAAW,CACf,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,MAAM,EAChB,OAAO,GAAE,YAAiB,GACzB,OAAO,CAAC,MAAM,CAAC,cAAc,EAAE,aAAa,CAAC,CAAC;IAyE3C,WAAW,CACf,MAAM,EAAE,MAAM,CAAC,cAAc,EAC7B,QAAQ,EAAE,MAAM,EAChB,OAAO,GAAE,YAAiB,GACzB,OAAO,CAAC,MAAM,CAAC,cAAc,EAAE,aAAa,CAAC,CAAC;IAqBjD,mBAAmB,IAAI,eAAe,EAAE;IAUxC,WAAW,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO;IAItC;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAYxB;;OAEG;IACH,OAAO,CAAC,eAAe;IAqBvB;;OAEG;IACH,OAAO,CAAC,iBAAiB;IAoBzB;;OAEG;IACH,OAAO,CAAC,aAAa;IAsDrB;;OAEG;IACH,OAAO,CAAC,aAAa;IAiDrB;;OAEG;IACH,OAAO,CAAC,aAAa;IAkCrB;;OAEG;IACH,OAAO,CAAC,SAAS;IAIjB;;OAEG;IACH,OAAO,CAAC,eAAe;IAcvB;;OAEG;IACH,OAAO,CAAC,aAAa;CAYtB"}