@nahisaho/katashiro-collector 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. package/dist/browser/ActionExecutor.d.ts +85 -0
  2. package/dist/browser/ActionExecutor.d.ts.map +1 -0
  3. package/dist/browser/ActionExecutor.js +171 -0
  4. package/dist/browser/ActionExecutor.js.map +1 -0
  5. package/dist/browser/BrowserAutomation.d.ts +147 -0
  6. package/dist/browser/BrowserAutomation.d.ts.map +1 -0
  7. package/dist/browser/BrowserAutomation.js +463 -0
  8. package/dist/browser/BrowserAutomation.js.map +1 -0
  9. package/dist/browser/ContentExtractor.d.ts +54 -0
  10. package/dist/browser/ContentExtractor.d.ts.map +1 -0
  11. package/dist/browser/ContentExtractor.js +159 -0
  12. package/dist/browser/ContentExtractor.js.map +1 -0
  13. package/dist/browser/SessionManager.d.ts +67 -0
  14. package/dist/browser/SessionManager.d.ts.map +1 -0
  15. package/dist/browser/SessionManager.js +173 -0
  16. package/dist/browser/SessionManager.js.map +1 -0
  17. package/dist/browser/index.d.ts +17 -0
  18. package/dist/browser/index.d.ts.map +1 -0
  19. package/dist/browser/index.js +17 -0
  20. package/dist/browser/index.js.map +1 -0
  21. package/dist/browser/types.d.ts +361 -0
  22. package/dist/browser/types.d.ts.map +1 -0
  23. package/dist/browser/types.js +23 -0
  24. package/dist/browser/types.js.map +1 -0
  25. package/dist/document/DocumentParser.d.ts +91 -0
  26. package/dist/document/DocumentParser.d.ts.map +1 -0
  27. package/dist/document/DocumentParser.js +234 -0
  28. package/dist/document/DocumentParser.js.map +1 -0
  29. package/dist/document/index.d.ts +11 -0
  30. package/dist/document/index.d.ts.map +1 -0
  31. package/dist/document/index.js +10 -0
  32. package/dist/document/index.js.map +1 -0
  33. package/dist/document/parsers/DOCXParser.d.ts +63 -0
  34. package/dist/document/parsers/DOCXParser.d.ts.map +1 -0
  35. package/dist/document/parsers/DOCXParser.js +362 -0
  36. package/dist/document/parsers/DOCXParser.js.map +1 -0
  37. package/dist/document/parsers/PDFParser.d.ts +60 -0
  38. package/dist/document/parsers/PDFParser.d.ts.map +1 -0
  39. package/dist/document/parsers/PDFParser.js +338 -0
  40. package/dist/document/parsers/PDFParser.js.map +1 -0
  41. package/dist/document/parsers/XLSXParser.d.ts +55 -0
  42. package/dist/document/parsers/XLSXParser.d.ts.map +1 -0
  43. package/dist/document/parsers/XLSXParser.js +314 -0
  44. package/dist/document/parsers/XLSXParser.js.map +1 -0
  45. package/dist/document/parsers/index.d.ts +10 -0
  46. package/dist/document/parsers/index.d.ts.map +1 -0
  47. package/dist/document/parsers/index.js +10 -0
  48. package/dist/document/parsers/index.js.map +1 -0
  49. package/dist/document/types.d.ts +251 -0
  50. package/dist/document/types.d.ts.map +1 -0
  51. package/dist/document/types.js +13 -0
  52. package/dist/document/types.js.map +1 -0
  53. package/dist/index.d.ts +7 -2
  54. package/dist/index.d.ts.map +1 -1
  55. package/dist/index.js +14 -2
  56. package/dist/index.js.map +1 -1
  57. package/dist/research/CoverageAnalyzer.d.ts +50 -0
  58. package/dist/research/CoverageAnalyzer.d.ts.map +1 -0
  59. package/dist/research/CoverageAnalyzer.js +169 -0
  60. package/dist/research/CoverageAnalyzer.js.map +1 -0
  61. package/dist/research/QueryPlanner.d.ts +57 -0
  62. package/dist/research/QueryPlanner.d.ts.map +1 -0
  63. package/dist/research/QueryPlanner.js +102 -0
  64. package/dist/research/QueryPlanner.js.map +1 -0
  65. package/dist/research/ResultAggregator.d.ts +39 -0
  66. package/dist/research/ResultAggregator.d.ts.map +1 -0
  67. package/dist/research/ResultAggregator.js +85 -0
  68. package/dist/research/ResultAggregator.js.map +1 -0
  69. package/dist/research/WideResearchEngine.d.ts +110 -0
  70. package/dist/research/WideResearchEngine.d.ts.map +1 -0
  71. package/dist/research/WideResearchEngine.js +330 -0
  72. package/dist/research/WideResearchEngine.js.map +1 -0
  73. package/dist/research/agents/AcademicSearchAgent.d.ts +57 -0
  74. package/dist/research/agents/AcademicSearchAgent.d.ts.map +1 -0
  75. package/dist/research/agents/AcademicSearchAgent.js +180 -0
  76. package/dist/research/agents/AcademicSearchAgent.js.map +1 -0
  77. package/dist/research/agents/EncyclopediaAgent.d.ts +49 -0
  78. package/dist/research/agents/EncyclopediaAgent.d.ts.map +1 -0
  79. package/dist/research/agents/EncyclopediaAgent.js +153 -0
  80. package/dist/research/agents/EncyclopediaAgent.js.map +1 -0
  81. package/dist/research/agents/NewsSearchAgent.d.ts +38 -0
  82. package/dist/research/agents/NewsSearchAgent.d.ts.map +1 -0
  83. package/dist/research/agents/NewsSearchAgent.js +146 -0
  84. package/dist/research/agents/NewsSearchAgent.js.map +1 -0
  85. package/dist/research/agents/WebSearchAgent.d.ts +45 -0
  86. package/dist/research/agents/WebSearchAgent.d.ts.map +1 -0
  87. package/dist/research/agents/WebSearchAgent.js +135 -0
  88. package/dist/research/agents/WebSearchAgent.js.map +1 -0
  89. package/dist/research/agents/index.d.ts +13 -0
  90. package/dist/research/agents/index.d.ts.map +1 -0
  91. package/dist/research/agents/index.js +12 -0
  92. package/dist/research/agents/index.js.map +1 -0
  93. package/dist/research/agents/types.d.ts +60 -0
  94. package/dist/research/agents/types.d.ts.map +1 -0
  95. package/dist/research/agents/types.js +9 -0
  96. package/dist/research/agents/types.js.map +1 -0
  97. package/dist/research/index.d.ts +16 -0
  98. package/dist/research/index.d.ts.map +1 -0
  99. package/dist/research/index.js +17 -0
  100. package/dist/research/index.js.map +1 -0
  101. package/dist/research/types.d.ts +206 -0
  102. package/dist/research/types.d.ts.map +1 -0
  103. package/dist/research/types.js +33 -0
  104. package/dist/research/types.js.map +1 -0
  105. package/package.json +1 -1
@@ -0,0 +1,362 @@
1
+ /**
2
+ * DOCXパーサー
3
+ *
4
+ * @design DES-COLLECT-003 §2.3
5
+ * @task TASK-001-4
6
+ */
7
+ import { ok, err } from '@nahisaho/katashiro-core';
8
+ import { DEFAULT_PARSE_OPTIONS } from '../types.js';
9
/**
 * DOCX parser implementation.
 *
 * Reads a DOCX document from a path, a buffer or a readable stream, converts
 * it with the optional `mammoth` dependency, and derives headings, paragraphs,
 * sections, tables and image references from the resulting raw text and HTML.
 * Failures are reported through the `err(...)` result helper rather than
 * thrown.
 *
 * @example
 * ```typescript
 * const parser = new DOCXParser();
 * const result = await parser.parse('./document.docx');
 * ```
 */
export class DOCXParser {
    /** Named HTML entities decoded by {@link DOCXParser#decodeEntities}. */
    static NAMED_ENTITIES = new Map([
        ['amp', '&'],
        ['lt', '<'],
        ['gt', '>'],
        ['quot', '"'],
        ['apos', "'"],
        ['nbsp', '\u00a0'],
    ]);
    /**
     * Image extension to MIME type table. A Map is used (instead of a plain
     * object) so that extensions such as "constructor" cannot resolve to
     * inherited Object.prototype members.
     */
    static IMAGE_MIME_TYPES = new Map([
        ['png', 'image/png'],
        ['jpg', 'image/jpeg'],
        ['jpeg', 'image/jpeg'],
        ['gif', 'image/gif'],
        ['bmp', 'image/bmp'],
        ['svg', 'image/svg+xml'],
    ]);
    /** Lazily loaded `mammoth` module; stays `null` until the first successful import. */
    mammoth = null;
    /**
     * Import `mammoth` on first use and cache it on the instance.
     *
     * @returns the loaded mammoth module
     * @throws Error when the optional dependency cannot be imported
     */
    async loadMammoth() {
        if (!this.mammoth) {
            try {
                // @ts-expect-error: mammoth is optional dependency
                this.mammoth = (await import('mammoth'));
            }
            catch {
                throw new Error('mammoth is not installed. Run: npm install mammoth');
            }
        }
        return this.mammoth;
    }
    /**
     * Read a DOCX file from disk and parse it.
     *
     * @param filePath path of the file to read
     * @param options parse options, merged over DEFAULT_PARSE_OPTIONS by parseBuffer
     * @returns the parseBuffer result, or an `err` result with code
     *          FILE_NOT_FOUND / PERMISSION_DENIED / PARSE_ERROR when reading fails
     */
    async parse(filePath, options = {}) {
        const fs = await import('fs/promises');
        const path = await import('path');
        let buffer;
        try {
            buffer = await fs.readFile(filePath);
        }
        catch (error) {
            // `error` may be any thrown value, so read `code` defensively.
            const code = error?.code;
            if (code === 'ENOENT') {
                return err({
                    code: 'FILE_NOT_FOUND',
                    message: `File not found: ${filePath}`,
                    filePath,
                });
            }
            if (code === 'EACCES') {
                return err({
                    code: 'PERMISSION_DENIED',
                    message: `Permission denied: ${filePath}`,
                    filePath,
                });
            }
            return err({
                code: 'PARSE_ERROR',
                message: `Failed to read file: ${this.describeError(error)}`,
                filePath,
                details: error,
            });
        }
        return this.parseBuffer(buffer, path.basename(filePath), options);
    }
    /**
     * Parse an in-memory DOCX document.
     *
     * @param buffer the DOCX bytes
     * @param filename name recorded in the metadata
     * @param options parse options, merged over DEFAULT_PARSE_OPTIONS
     * @returns `ok` with content, structure and metadata (plus tables/images when
     *          the corresponding options are enabled), or `err` with code
     *          FILE_TOO_LARGE / PASSWORD_PROTECTED / CORRUPTED_FILE / PARSE_ERROR
     */
    async parseBuffer(buffer, filename, options = {}) {
        const mergedOptions = { ...DEFAULT_PARSE_OPTIONS, ...options };
        // Size check before any conversion work is done.
        if (buffer.length > mergedOptions.maxFileSize) {
            return err({
                code: 'FILE_TOO_LARGE',
                message: `File size ${buffer.length} exceeds maximum ${mergedOptions.maxFileSize}`,
            });
        }
        try {
            const mammoth = await this.loadMammoth();
            // Raw text is the document content; the HTML is only used to
            // recover structure (headings, tables, images).
            const textResult = await mammoth.extractRawText({ buffer });
            const htmlResult = await mammoth.convertToHtml({ buffer });
            const content = textResult.value;
            const html = htmlResult.value;
            const result = {
                content,
                structure: this.extractStructure(content, html),
                metadata: this.extractMetadata(filename, buffer.length, content),
            };
            if (mergedOptions.extractTables) {
                result.tables = this.extractTables(html);
            }
            if (mergedOptions.extractImages) {
                result.images = this.extractImages(html);
            }
            return ok(result);
        }
        catch (error) {
            const errorMessage = this.describeError(error) || 'Unknown error';
            if (errorMessage.includes('password') || errorMessage.includes('encrypted')) {
                return err({
                    code: 'PASSWORD_PROTECTED',
                    message: 'DOCX is password protected',
                    details: error,
                });
            }
            if (errorMessage.includes('Invalid') ||
                errorMessage.includes('corrupt') ||
                errorMessage.includes('not a valid')) {
                return err({
                    code: 'CORRUPTED_FILE',
                    message: 'DOCX file is corrupted or invalid',
                    details: error,
                });
            }
            return err({
                code: 'PARSE_ERROR',
                message: `Failed to parse DOCX: ${errorMessage}`,
                details: error,
            });
        }
    }
    /**
     * Collect a readable stream into a buffer and parse it.
     *
     * The returned promise always resolves (never rejects): stream errors and
     * unexpected parse failures are converted into `err` results.
     *
     * @param stream readable stream emitting 'data', 'error' and 'end'
     * @param filename name recorded in the metadata
     * @param options parse options forwarded to parseBuffer
     */
    async parseStream(stream, filename, options = {}) {
        const chunks = [];
        return new Promise((resolve) => {
            stream.on('data', (chunk) => {
                // Avoid copying chunks that are already Buffers.
                chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
            });
            stream.on('error', (error) => {
                resolve(err({
                    code: 'PARSE_ERROR',
                    message: `Stream read error: ${this.describeError(error)}`,
                }));
            });
            stream.on('end', () => {
                // Route a rejection into the result instead of leaving an
                // unhandled rejection inside the event handler.
                this.parseBuffer(Buffer.concat(chunks), filename, options).then(resolve, (error) => {
                    resolve(err({
                        code: 'PARSE_ERROR',
                        message: `Failed to parse DOCX: ${this.describeError(error) || 'Unknown error'}`,
                        details: error,
                    }));
                });
            });
        });
    }
    /** List the formats this parser handles (DOCX only). */
    getSupportedFormats() {
        return [
            {
                extension: '.docx',
                mimeType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                description: 'Word Document',
            },
        ];
    }
    /** Return true when `filename` ends with ".docx" (case-insensitive). */
    isSupported(filename) {
        return filename.toLowerCase().endsWith('.docx');
    }
    /**
     * Build the structure object: headings come from the HTML, paragraphs and
     * sections from the raw text.
     */
    extractStructure(content, html) {
        const headings = this.extractHeadings(html);
        return {
            headings,
            paragraphs: this.extractParagraphs(content),
            sections: this.buildSections(content, headings),
        };
    }
    /**
     * Extract `<h1>`..`<h6>` headings from HTML.
     *
     * `position` is the ordinal index of the heading (0, 1, 2, ...), not a
     * character offset. `text` has tags removed and entities decoded.
     */
    extractHeadings(html) {
        const headingPattern = /<h([1-6])[^>]*>([\s\S]*?)<\/h\1>/gi;
        return Array.from(html.matchAll(headingPattern), (match, index) => ({
            level: Number.parseInt(match[1], 10),
            text: this.stripHtml(match[2]),
            position: index,
        }));
    }
    /**
     * Split raw text into paragraphs on blank lines.
     *
     * `start`/`end` are real character offsets into `content`, i.e.
     * `content.slice(start, end) === text`, regardless of how long the blank
     * separator is or whether a block carries leading whitespace.
     */
    extractParagraphs(content) {
        const paragraphs = [];
        let blockStart = 0;
        const addBlock = (blockEnd) => {
            const block = content.slice(blockStart, blockEnd);
            const text = block.trim();
            if (text.length === 0) {
                return;
            }
            // Skip the whitespace that trim() removed from the front.
            const start = blockStart + (block.length - block.trimStart().length);
            paragraphs.push({ text, start, end: start + text.length });
        };
        for (const separator of content.matchAll(/\n\s*\n/g)) {
            addBlock(separator.index);
            blockStart = separator.index + separator[0].length;
        }
        addBlock(content.length);
        return paragraphs;
    }
    /**
     * Build one section per heading from the raw text.
     *
     * Headings are located in document order, each search resuming after the
     * previous heading's line, so repeated titles map to successive
     * occurrences. A section runs from its heading line up to the next located
     * heading (or the end of the text). A heading whose text cannot be located
     * starts at the current search position. `start`/`end` are character
     * offsets into `content`. With no headings, a single "Main Content"
     * section covering everything is returned.
     */
    buildSections(content, headings) {
        if (headings.length === 0) {
            return [
                {
                    title: 'Main Content',
                    content,
                    start: 0,
                    end: content.length,
                },
            ];
        }
        const lines = content.split('\n');
        // Character offset of the first character of each line.
        const lineOffsets = [];
        let runningOffset = 0;
        for (const line of lines) {
            lineOffsets.push(runningOffset);
            runningOffset += line.length + 1; // +1 for the '\n' removed by split
        }
        let cursor = 0;
        const anchors = headings.map((heading) => {
            const located = this.findHeadingLine(lines, heading.text, cursor);
            if (located === -1) {
                return { startLine: cursor, located: false };
            }
            cursor = located + 1;
            return { startLine: located, located: true };
        });
        return headings.map((heading, i) => {
            const { startLine } = anchors[i];
            const next = anchors[i + 1];
            const endLine = next && next.located ? next.startLine : lines.length;
            const sectionContent = lines.slice(startLine, endLine).join('\n');
            const start = startLine < lines.length ? lineOffsets[startLine] : content.length;
            return {
                title: heading.text,
                content: sectionContent,
                start,
                end: start + sectionContent.length,
            };
        });
    }
    /**
     * Find the line holding a heading, searching from line `from`.
     *
     * A line that equals the heading text (ignoring surrounding whitespace)
     * wins; otherwise the first line that merely contains the text is used.
     *
     * @returns the line index, or -1 when no line matches
     */
    findHeadingLine(lines, text, from) {
        const wanted = text.trim();
        let firstContaining = -1;
        for (let i = from; i < lines.length; i++) {
            if (lines[i].trim() === wanted) {
                return i;
            }
            if (firstContaining === -1 && lines[i].includes(text)) {
                firstContaining = i;
            }
        }
        return firstContaining;
    }
    /**
     * Extract tables from HTML.
     *
     * The first non-empty row becomes `headers`; the remaining rows become
     * string-typed cells. `position` is the offset of the `<table>` tag in the
     * HTML. Nested tables are not supported by this regex-based scan.
     */
    extractTables(html) {
        const tables = [];
        for (const tableMatch of html.matchAll(/<table\b[^>]*>([\s\S]*?)<\/table>/gi)) {
            const rows = [];
            for (const rowMatch of tableMatch[1].matchAll(/<tr\b[^>]*>([\s\S]*?)<\/tr>/gi)) {
                const cells = Array.from(rowMatch[1].matchAll(/<t[dh]\b[^>]*>([\s\S]*?)<\/t[dh]>/gi), (cellMatch) => this.stripHtml(cellMatch[1]));
                if (cells.length > 0) {
                    rows.push(cells);
                }
            }
            if (rows.length === 0) {
                continue;
            }
            const [headerRow, ...dataRows] = rows;
            tables.push({
                // Ids count only the tables that were actually emitted.
                id: `table-${tables.length}`,
                headers: headerRow,
                rows: dataRows.map((row) => ({
                    cells: row.map((value) => ({
                        value,
                        type: 'string',
                    })),
                })),
                position: tableMatch.index,
            });
        }
        return tables;
    }
    /**
     * Extract image references from HTML.
     *
     * Each `<img>` tag with a double-quoted `src` yields one entry. `alt` is
     * read independently of attribute order. Base64 data URIs produce
     * `mimeType` + `data`; any other `src` produces `filename` + a MIME type
     * guessed from the extension. `position` is the offset of the tag.
     */
    extractImages(html) {
        const images = [];
        for (const tagMatch of html.matchAll(/<img\b[^>]*>/gi)) {
            const tag = tagMatch[0];
            // Leading whitespace keeps attributes like data-src from matching.
            const srcMatch = /\ssrc="([^"]*)"/i.exec(tag);
            if (!srcMatch) {
                continue;
            }
            const altMatch = /\salt="([^"]*)"/i.exec(tag);
            const src = this.decodeEntities(srcMatch[1]);
            const altText = altMatch ? this.decodeEntities(altMatch[1]) : undefined;
            const id = `image-${images.length}`;
            const base64Match = src.match(/^data:([^;]+);base64,(.+)$/);
            if (base64Match) {
                images.push({
                    id,
                    mimeType: base64Match[1],
                    altText,
                    position: tagMatch.index,
                    data: base64Match[2],
                });
            }
            else {
                images.push({
                    id,
                    filename: src,
                    mimeType: this.guessMimeType(src),
                    altText,
                    position: tagMatch.index,
                });
            }
        }
        return images;
    }
    /**
     * Remove HTML tags, decode entities and trim the result, so the returned
     * text is comparable with the document's raw text.
     */
    stripHtml(html) {
        return this.decodeEntities(html.replace(/<[^>]+>/g, '')).trim();
    }
    /**
     * Decode numeric entities and the named entities in NAMED_ENTITIES.
     *
     * Decoding is a single pass, so "&amp;lt;" becomes "&lt;" and is not
     * decoded a second time. Unknown entities are left untouched.
     */
    decodeEntities(text) {
        return text.replace(/&(#x[0-9a-f]+|#[0-9]+|[a-z]+);/gi, (entity, body) => {
            if (body[0] !== '#') {
                return DOCXParser.NAMED_ENTITIES.get(body.toLowerCase()) ?? entity;
            }
            const isHex = body[1] === 'x' || body[1] === 'X';
            const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
            // String.fromCodePoint throws outside the Unicode range.
            return codePoint >= 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
        });
    }
    /**
     * Best-effort message for any thrown value.
     *
     * @returns the `message` of an Error-like value, otherwise the value's
     *          string form ('' for null/undefined)
     */
    describeError(error) {
        if (error instanceof Error) {
            return error.message;
        }
        if (error && typeof error.message === 'string') {
            return error.message;
        }
        return error == null ? '' : String(error);
    }
    /**
     * Build the metadata record. Counts are computed from the raw text;
     * `wordCount` is the number of whitespace-separated tokens.
     */
    extractMetadata(filename, fileSize, content) {
        const words = content.split(/\s+/).filter((word) => word.length > 0);
        return {
            filename,
            fileSize,
            mimeType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            characterCount: content.length,
            wordCount: words.length,
        };
    }
    /**
     * Guess an image MIME type from a file name or URL.
     *
     * Any query string or fragment is ignored. Unknown extensions fall back to
     * 'application/octet-stream'.
     */
    guessMimeType(filename) {
        const pathOnly = filename.split(/[?#]/, 1)[0];
        const ext = pathOnly.toLowerCase().split('.').pop();
        return DOCXParser.IMAGE_MIME_TYPES.get(ext ?? '') ?? 'application/octet-stream';
    }
}
362
+ //# sourceMappingURL=DOCXParser.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"DOCXParser.js","sourceRoot":"","sources":["../../../src/document/parsers/DOCXParser.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,EAAE,EAAE,GAAG,EAAe,MAAM,0BAA0B,CAAC;AAehE,OAAO,EAAE,qBAAqB,EAAE,MAAM,aAAa,CAAC;AAkBpD;;;;;;;;GAQG;AACH,MAAM,OAAO,UAAU;IACb,OAAO,GAAyB,IAAI,CAAC;IAErC,KAAK,CAAC,WAAW;QACvB,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YAClB,IAAI,CAAC;gBACH,mDAAmD;gBACnD,IAAI,CAAC,OAAO,GAAG,CAAC,MAAM,MAAM,CAAC,SAAS,CAAC,CAA6B,CAAC;YACvE,CAAC;YAAC,MAAM,CAAC;gBACP,MAAM,IAAI,KAAK,CAAC,oDAAoD,CAAC,CAAC;YACxE,CAAC;QACH,CAAC;QACD,OAAO,IAAI,CAAC,OAAO,CAAC;IACtB,CAAC;IAED,KAAK,CAAC,KAAK,CACT,QAAgB,EAChB,UAAwB,EAAE;QAE1B,MAAM,EAAE,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,CAAC;QACvC,MAAM,IAAI,GAAG,MAAM,MAAM,CAAC,MAAM,CAAC,CAAC;QAElC,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;YAC3C,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;YACzC,OAAO,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC;QACrD,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,IAAK,KAA+B,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;gBACvD,OAAO,GAAG,CAAC;oBACT,IAAI,EAAE,gBAAgB;oBACtB,OAAO,EAAE,mBAAmB,QAAQ,EAAE;oBACtC,QAAQ;iBACT,CAAC,CAAC;YACL,CAAC;YACD,IAAK,KAA+B,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;gBACvD,OAAO,GAAG,CAAC;oBACT,IAAI,EAAE,mBAAmB;oBACzB,OAAO,EAAE,sBAAsB,QAAQ,EAAE;oBACzC,QAAQ;iBACT,CAAC,CAAC;YACL,CAAC;YACD,OAAO,GAAG,CAAC;gBACT,IAAI,EAAE,aAAa;gBACnB,OAAO,EAAE,wBAAyB,KAAe,CAAC,OAAO,EAAE;gBAC3D,QAAQ;gBACR,OAAO,EAAE,KAAK;aACf,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,KAAK,CAAC,WAAW,CACf,MAAc,EACd,QAAgB,EAChB,UAAwB,EAAE;QAE1B,MAAM,aAAa,GAAG,EAAE,GAAG,qBAAqB,EAAE,GAAG,OAAO,EAAE,CAAC;QAE/D,UAAU;QACV,IAAI,MAAM,CAAC,MAAM,GAAG,aAAa,CAAC,WAAW,EAAE,CAAC;YAC9C,OAAO,GAAG,CAAC;gBACT,IAAI,EAAE,gBAAgB;gBACtB,OAAO,EAAE,aAAa,MAAM,CAAC,MAAM,oBAAoB,aAAa,CAAC,WAAW,EAAE;aACnF,CAAC,CAAC;QACL,CAAC;QAED,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,WAAW,EAAE,CAAC;YAEzC,SAAS;YACT,MAAM,UAAU,GAAG,MAAM,OAAO,CAAC,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;YAC5D,MAAM,OAAO,GAAG,UAAU,CAAC,KAAK,CAAC;YAEjC,gBAAgB;YAChB,MAAM,UAAU,G
AAG,MAAM,OAAO,CAAC,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;YAC3D,MAAM,IAAI,GAAG,UAAU,CAAC,KAAK,CAAC;YAE9B,MAAM,SAAS,GAAG,IAAI,CAAC,gBAAgB,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC;YACvD,MAAM,QAAQ,GAAG,IAAI,CAAC,eAAe,CAAC,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;YAExE,MAAM,MAAM,GAAmB;gBAC7B,OAAO;gBACP,SAAS;gBACT,QAAQ;aACT,CAAC;YAEF,SAAS;YACT,IAAI,aAAa,CAAC,aAAa,EAAE,CAAC;gBAChC,MAAM,CAAC,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC;YAC3C,CAAC;YAED,OAAO;YACP,IAAI,aAAa,CAAC,aAAa,EAAE,CAAC;gBAChC,MAAM,CAAC,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC;YAC3C,CAAC;YAED,OAAO,EAAE,CAAC,MAAM,CAAC,CAAC;QACpB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAAI,KAAe,CAAC,OAAO,IAAI,eAAe,CAAC;YAEjE,IAAI,YAAY,CAAC,QAAQ,CAAC,UAAU,CAAC,IAAI,YAAY,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;gBAC5E,OAAO,GAAG,CAAC;oBACT,IAAI,EAAE,oBAAoB;oBAC1B,OAAO,EAAE,4BAA4B;oBACrC,OAAO,EAAE,KAAK;iBACf,CAAC,CAAC;YACL,CAAC;YAED,IACE,YAAY,CAAC,QAAQ,CAAC,SAAS,CAAC;gBAChC,YAAY,CAAC,QAAQ,CAAC,SAAS,CAAC;gBAChC,YAAY,CAAC,QAAQ,CAAC,aAAa,CAAC,EACpC,CAAC;gBACD,OAAO,GAAG,CAAC;oBACT,IAAI,EAAE,gBAAgB;oBACtB,OAAO,EAAE,mCAAmC;oBAC5C,OAAO,EAAE,KAAK;iBACf,CAAC,CAAC;YACL,CAAC;YAED,OAAO,GAAG,CAAC;gBACT,IAAI,EAAE,aAAa;gBACnB,OAAO,EAAE,yBAAyB,YAAY,EAAE;gBAChD,OAAO,EAAE,KAAK;aACf,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,KAAK,CAAC,WAAW,CACf,MAA6B,EAC7B,QAAgB,EAChB,UAAwB,EAAE;QAE1B,MAAM,MAAM,GAAa,EAAE,CAAC;QAE5B,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;YAC7B,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;YAC9D,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,EAAE;gBAC3B,OAAO,CACL,GAAG,CAAC;oBACF,IAAI,EAAE,aAAa;oBACnB,OAAO,EAAE,sBAAsB,KAAK,CAAC,OAAO,EAAE;iBAC/C,CAAC,CACH,CAAC;YACJ,CAAC,CAAC,CAAC;YACH,MAAM,CAAC,EAAE,CAAC,KAAK,EAAE,KAAK,IAAI,EAAE;gBAC1B,MAAM,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;gBACrC,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC;gBACjE,OAAO,CAAC,MAAM,CAAC,CAAC;YAClB,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC;IAED,mBAAmB;QACjB,OAAO;YACL
;gBACE,SAAS,EAAE,OAAO;gBAClB,QAAQ,EAAE,yEAAyE;gBACnF,WAAW,EAAE,eAAe;aAC7B;SACF,CAAC;IACJ,CAAC;IAED,WAAW,CAAC,QAAgB;QAC1B,OAAO,QAAQ,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;IAClD,CAAC;IAED;;OAEG;IACK,gBAAgB,CAAC,OAAe,EAAE,IAAY;QACpD,MAAM,QAAQ,GAAG,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC;QAC5C,MAAM,UAAU,GAAG,IAAI,CAAC,iBAAiB,CAAC,OAAO,CAAC,CAAC;QACnD,MAAM,QAAQ,GAAG,IAAI,CAAC,aAAa,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC;QAEvD,OAAO;YACL,QAAQ;YACR,UAAU;YACV,QAAQ;SACT,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,eAAe,CAAC,IAAY;QAClC,MAAM,QAAQ,GAAc,EAAE,CAAC;QAC/B,MAAM,YAAY,GAAG,+BAA+B,CAAC;QACrD,IAAI,KAAK,CAAC;QACV,IAAI,QAAQ,GAAG,CAAC,CAAC;QAEjB,OAAO,CAAC,KAAK,GAAG,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YAClD,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAE,EAAE,EAAE,CAAC,CAAC;YACtC,MAAM,IAAI,GAAG,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC,CAAC;YAEvC,QAAQ,CAAC,IAAI,CAAC;gBACZ,KAAK;gBACL,IAAI;gBACJ,QAAQ;aACT,CAAC,CAAC;YACH,QAAQ,EAAE,CAAC;QACb,CAAC;QAED,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED;;OAEG;IACK,iBAAiB,CAAC,OAAe;QACvC,MAAM,UAAU,GAAgB,EAAE,CAAC;QACnC,MAAM,MAAM,GAAG,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;QACxC,IAAI,QAAQ,GAAG,CAAC,CAAC;QAEjB,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,EAAE,CAAC;YAC7B,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACvB,UAAU,CAAC,IAAI,CAAC;oBACd,IAAI,EAAE,OAAO;oBACb,KAAK,EAAE,QAAQ;oBACf,GAAG,EAAE,QAAQ,GAAG,OAAO,CAAC,MAAM;iBAC/B,CAAC,CAAC;YACL,CAAC;YACD,QAAQ,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC;QAC/B,CAAC;QAED,OAAO,UAAU,CAAC;IACpB,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,OAAe,EAAE,QAAmB;QACxD,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC1B,OAAO;gBACL;oBACE,KAAK,EAAE,cAAc;oBACrB,OAAO;oBACP,KAAK,EAAE,CAAC;oBACR,GAAG,EAAE,OAAO,CAAC,MAAM;iBACpB;aACF,CAAC;QACJ,CAAC;QAED,MAAM,QAAQ,GAAc,EAAE,CAAC;QAC/B,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAElC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACzC,MAAM,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAE,CAAC;YAC7B,MAAM,WAAW,GAAG,QAAQ,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;YAEpC,iBAAiB;YACjB,IAAI,S
AAS,GAAG,CAAC,CAAC;YAClB,IAAI,OAAO,GAAG,KAAK,CAAC,MAAM,CAAC;YAE3B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACtC,IAAI,KAAK,CAAC,CAAC,CAAE,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC;oBACrC,SAAS,GAAG,CAAC,CAAC;oBACd,MAAM;gBACR,CAAC;YACH,CAAC;YAED,IAAI,WAAW,EAAE,CAAC;gBAChB,KAAK,IAAI,CAAC,GAAG,SAAS,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;oBAClD,IAAI,KAAK,CAAC,CAAC,CAAE,CAAC,QAAQ,CAAC,WAAW,CAAC,IAAI,CAAC,EAAE,CAAC;wBACzC,OAAO,GAAG,CAAC,CAAC;wBACZ,MAAM;oBACR,CAAC;gBACH,CAAC;YACH,CAAC;YAED,MAAM,cAAc,GAAG,KAAK,CAAC,KAAK,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClE,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC;YAC1D,MAAM,GAAG,GAAG,KAAK,GAAG,cAAc,CAAC,MAAM,CAAC;YAE1C,QAAQ,CAAC,IAAI,CAAC;gBACZ,KAAK,EAAE,OAAO,CAAC,IAAI;gBACnB,OAAO,EAAE,cAAc;gBACvB,KAAK;gBACL,GAAG;aACJ,CAAC,CAAC;QACL,CAAC;QAED,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,IAAY;QAChC,MAAM,MAAM,GAAgB,EAAE,CAAC;QAC/B,MAAM,UAAU,GAAG,mCAAmC,CAAC;QACvD,IAAI,UAAU,CAAC;QACf,IAAI,OAAO,GAAG,CAAC,CAAC;QAEhB,OAAO,CAAC,UAAU,GAAG,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YACrD,MAAM,SAAS,GAAG,UAAU,CAAC,CAAC,CAAE,CAAC;YACjC,MAAM,IAAI,GAAe,EAAE,CAAC;YAE5B,MAAM,QAAQ,GAAG,6BAA6B,CAAC;YAC/C,IAAI,QAAQ,CAAC;YAEb,OAAO,CAAC,QAAQ,GAAG,QAAQ,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;gBACtD,MAAM,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAE,CAAC;gBAC7B,MAAM,KAAK,GAAa,EAAE,CAAC;gBAE3B,MAAM,SAAS,GAAG,mCAAmC,CAAC;gBACtD,IAAI,SAAS,CAAC;gBAEd,OAAO,CAAC,SAAS,GAAG,SAAS,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;oBACtD,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAE,CAAC,CAAC,CAAC;gBAC5C,CAAC;gBAED,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACrB,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;gBACnB,CAAC;YACH,CAAC;YAED,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACpB,MAAM,CAAC,SAAS,EAAE,GAAG,QAAQ,CAAC,GAAG,IAAI,CAAC;gBAEtC,MAAM,CAAC,IAAI,CAAC;oBACV,EAAE,EAAE,SAAS,OAAO,EAAE,EAAE;oBACxB,OAAO,EAAE,SAAS;oBA
ClB,IAAI,EAAE,QAAQ,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;wBAC3B,KAAK,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;4BACzB,KAAK;4BACL,IAAI,EAAE,QAAiB;yBACxB,CAAC,CAAC;qBACJ,CAAC,CAAC;oBACH,QAAQ,EAAE,UAAU,CAAC,KAAK;iBAC3B,CAAC,CAAC;YACL,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,IAAY;QAChC,MAAM,MAAM,GAAqB,EAAE,CAAC;QACpC,MAAM,QAAQ,GAAG,uDAAuD,CAAC;QACzE,IAAI,KAAK,CAAC;QACV,IAAI,OAAO,GAAG,CAAC,CAAC;QAEhB,OAAO,CAAC,KAAK,GAAG,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YAC9C,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,CAAE,CAAC;YACtB,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;YAErB,cAAc;YACd,MAAM,WAAW,GAAG,GAAG,CAAC,KAAK,CAAC,4BAA4B,CAAC,CAAC;YAC5D,IAAI,WAAW,EAAE,CAAC;gBAChB,MAAM,CAAC,IAAI,CAAC;oBACV,EAAE,EAAE,SAAS,OAAO,EAAE,EAAE;oBACxB,QAAQ,EAAE,WAAW,CAAC,CAAC,CAAE;oBACzB,OAAO,EAAE,GAAG;oBACZ,QAAQ,EAAE,KAAK,CAAC,KAAK;oBACrB,IAAI,EAAE,WAAW,CAAC,CAAC,CAAC;iBACrB,CAAC,CAAC;YACL,CAAC;iBAAM,CAAC;gBACN,MAAM,CAAC,IAAI,CAAC;oBACV,EAAE,EAAE,SAAS,OAAO,EAAE,EAAE;oBACxB,QAAQ,EAAE,GAAG;oBACb,QAAQ,EAAE,IAAI,CAAC,aAAa,CAAC,GAAG,CAAC;oBACjC,OAAO,EAAE,GAAG;oBACZ,QAAQ,EAAE,KAAK,CAAC,KAAK;iBACtB,CAAC,CAAC;YACL,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,SAAS,CAAC,IAAY;QAC5B,OAAO,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IAC7C,CAAC;IAED;;OAEG;IACK,eAAe,CACrB,QAAgB,EAChB,QAAgB,EAChB,OAAe;QAEf,OAAO;YACL,QAAQ;YACR,QAAQ;YACR,QAAQ,EAAE,yEAAyE;YACnF,cAAc,EAAE,OAAO,CAAC,MAAM;YAC9B,SAAS,EAAE,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,MAAM;SACnE,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,QAAgB;QACpC,MAAM,GAAG,GAAG,QAAQ,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,CAAC;QACpD,MAAM,SAAS,GAA2B;YACxC,GAAG,EAAE,WAAW;YAChB,GAAG,EAAE,YAAY;YACjB,IAAI,EAAE,YAAY;YAClB,GAAG,EAAE,WAAW;YAChB,GAAG,EAAE,WAAW;YAChB,GAAG,EAAE,eAAe;SACrB,CAAC;QACF,OAAO,SAAS,CAAC,GAAG,IAAI,EAAE,CAAC,IAAI,0BAA0B,CAAC;IAC5D,CAAC;CACF"}
@@ -0,0 +1,60 @@
1
+ /**
2
+ * PDFパーサー
3
+ *
4
+ * @design DES-COLLECT-003 §2.3
5
+ * @task TASK-001-3
6
+ */
7
+ import { type Result } from '@nahisaho/katashiro-core';
8
+ import type { IDocumentParser, ParsedDocument, DocumentError, ParseOptions, SupportedFormat } from '../types.js';
9
+ /**
10
+ * PDF parser implementation
11
+ *
12
+ * @example
13
+ * ```typescript
14
+ * const parser = new PDFParser();
15
+ * const result = await parser.parse('./document.pdf');
16
+ * ```
17
+ */
18
+ export declare class PDFParser implements IDocumentParser {
19
+ private pdfParseModule;
20
+ private loadPdfParse;
21
+ parse(filePath: string, options?: ParseOptions): Promise<Result<ParsedDocument, DocumentError>>;
22
+ parseBuffer(buffer: Buffer, filename: string, options?: ParseOptions): Promise<Result<ParsedDocument, DocumentError>>;
23
+ parseStream(stream: NodeJS.ReadableStream, filename: string, options?: ParseOptions): Promise<Result<ParsedDocument, DocumentError>>;
24
+ getSupportedFormats(): SupportedFormat[];
25
+ isSupported(filename: string): boolean;
26
+ /**
27
+ * Extract the document structure from text
28
+ */
29
+ private extractStructure;
30
+ /**
31
+ * Extract headings (heuristic)
32
+ */
33
+ private extractHeadings;
34
+ /**
35
+ * Extract paragraphs
36
+ */
37
+ private extractParagraphs;
38
+ /**
39
+ * Build sections
40
+ */
41
+ private buildSections;
42
+ /**
43
+ * Extract metadata
44
+ */
45
+ private extractMetadata;
46
+ /**
47
+ * Parse the PDF date format
48
+ */
49
+ private parsePdfDate;
50
+ /**
51
+ * Extract page information
52
+ */
53
+ private extractPages;
54
+ /**
55
+ * Extract tables (simplified implementation)
56
+ */
57
+ private extractTables;
58
+ private buildTableData;
59
+ }
60
+ //# sourceMappingURL=PDFParser.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"PDFParser.d.ts","sourceRoot":"","sources":["../../../src/document/parsers/PDFParser.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAW,KAAK,MAAM,EAAE,MAAM,0BAA0B,CAAC;AAChE,OAAO,KAAK,EACV,eAAe,EACf,cAAc,EACd,aAAa,EACb,YAAY,EACZ,eAAe,EAQhB,MAAM,aAAa,CAAC;AAsBrB;;;;;;;;GAQG;AACH,qBAAa,SAAU,YAAW,eAAe;IAC/C,OAAO,CAAC,cAAc,CAAuD;YAE/D,YAAY;IAapB,KAAK,CACT,QAAQ,EAAE,MAAM,EAChB,OAAO,GAAE,YAAiB,GACzB,OAAO,CAAC,MAAM,CAAC,cAAc,EAAE,aAAa,CAAC,CAAC;IAgC3C,WAAW,CACf,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,MAAM,EAChB,OAAO,GAAE,YAAiB,GACzB,OAAO,CAAC,MAAM,CAAC,cAAc,EAAE,aAAa,CAAC,CAAC;IA4D3C,WAAW,CACf,MAAM,EAAE,MAAM,CAAC,cAAc,EAC7B,QAAQ,EAAE,MAAM,EAChB,OAAO,GAAE,YAAiB,GACzB,OAAO,CAAC,MAAM,CAAC,cAAc,EAAE,aAAa,CAAC,CAAC;IAqBjD,mBAAmB,IAAI,eAAe,EAAE;IAIxC,WAAW,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO;IAItC;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAYxB;;OAEG;IACH,OAAO,CAAC,eAAe;IAuCvB;;OAEG;IACH,OAAO,CAAC,iBAAiB;IAoBzB;;OAEG;IACH,OAAO,CAAC,aAAa;IA8BrB;;OAEG;IACH,OAAO,CAAC,eAAe;IAuBvB;;OAEG;IACH,OAAO,CAAC,YAAY;IAqBpB;;OAEG;IACH,OAAO,CAAC,YAAY;IAoBpB;;OAEG;IACH,OAAO,CAAC,aAAa;IAoCrB,OAAO,CAAC,cAAc;CAevB"}