elaws-parser 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- elaws_parser/__init__.py +19 -0
- elaws_parser/hourei_apiv2.py +128 -0
- elaws_parser/law_extraction.py +484 -0
- elaws_parser/law_extraction_v2.py +739 -0
- elaws_parser/text_converter.py +407 -0
- elaws_parser/yaml_converter.py +727 -0
- elaws_parser-0.1.0.dist-info/METADATA +149 -0
- elaws_parser-0.1.0.dist-info/RECORD +10 -0
- elaws_parser-0.1.0.dist-info/WHEEL +5 -0
- elaws_parser-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,739 @@
|
|
|
1
|
+
"""
|
|
2
|
+
YAMLパーサを利用する場合のlanggraphコード
|
|
3
|
+
|
|
4
|
+
# TODO :: law_extraction.pyとかぶっているGraphBuilderなどのリファクタリング
|
|
5
|
+
# TODO :: YAMLArticleExtractorとRegulationExtractorも共通部分が多いのでまとめられないか検討
|
|
6
|
+
# TODO :: YamlArticleExtractorはyaml_converterと密接に繋がっているので,場所を移動する.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
from abc import ABC, abstractmethod
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from enum import Enum
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any, Dict, List, Literal, Optional, TypedDict
|
|
15
|
+
|
|
16
|
+
import yaml
|
|
17
|
+
from langchain_core.language_models import BaseLLM
|
|
18
|
+
from langchain_core.messages import BaseMessage, SystemMessage
|
|
19
|
+
from langchain_core.prompts import PromptTemplate
|
|
20
|
+
from langchain_openai import ChatOpenAI
|
|
21
|
+
from langgraph.graph import END, StateGraph # CompiledGraph
|
|
22
|
+
from pydantic import BaseModel, Field
|
|
23
|
+
|
|
24
|
+
from .law_extraction import ( # RegulationExtractor,
|
|
25
|
+
BaseExtractor,
|
|
26
|
+
ExtractionResult,
|
|
27
|
+
GraphState,
|
|
28
|
+
LegalDocument,
|
|
29
|
+
ProcessingStage,
|
|
30
|
+
PromptManager,
|
|
31
|
+
ViewpointGenerator,
|
|
32
|
+
flatten_state,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
logger = logging.getLogger(__name__)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class RelevantArticles(BaseModel):
    """関連条文のstructured output用Pydanticモデル"""

    # Schema for the structured LLM reply: this model is what the extractors in
    # this file hand to `llm.with_structured_output(...)`.
    #
    # NOTE(review): the class docstring and both `description=` strings are
    # deliberately left untranslated. pydantic publishes them in the model's
    # JSON schema, so they are presumably part of what the LLM is shown;
    # editing them would change runtime behaviour, not just documentation.

    # Article numbers chosen by the LLM, kept as strings because branch
    # numbers such as '20_2' are not plain integers.
    article_numbers: List[str] = Field(
        description="関連する条文番号のリスト（例: ['3', '4', '20_2', '62']）"
    )
    # Short free-text justification for the selection; the extractors copy it
    # into the result metadata.
    extraction_reasoning: str = Field(
        description="これらの条文を選択した理由の簡潔な説明"
    )
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class ExtractedArticleContent:
    """Result of looking up one article in a law's YAML structure.

    One instance is produced per requested article number, including
    numbers that could not be resolved (see ``found``).
    """

    # Article number exactly as it was requested by the caller.
    article_num: str
    # Value of the article's "title" key; empty string when absent or when
    # the lookup failed.
    title: str
    # Rendered article text, or a "could not be retrieved" message when the
    # lookup failed.
    full_content: str
    # False when the article could not be extracted.
    found: bool = True
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class YamlArticleExtractor:
    """Locate articles inside a law's parsed YAML tree and render them as text.

    The document root is expected to expose exactly one of these layouts
    (checked in this order of precedence):

    * ``parts``    -> ``chapters`` -> (``articles`` | ``sections`` -> ...)
    * ``chapters`` -> (``articles`` | ``sections`` -> ...)
    * ``articles`` (flat list, e.g. enforcement regulations)

    Inside a chapter, articles may sit directly under the chapter, under a
    section, or under a subsection of a section.
    """

    # Root-level container keys, in the order they take precedence.
    _ROOT_KEYS = ("parts", "chapters", "articles")

    def __init__(self, yaml_data: Dict[str, Any]):
        """
        Args:
            yaml_data: Dictionary holding the law's YAML data.
        """
        self.yaml_data = yaml_data

    def extract_articles_by_numbers(
        self, article_numbers: List[str]
    ) -> "List[ExtractedArticleContent]":
        """Extract the content of every requested article.

        The result always has one entry per requested number, in request
        order. A number that cannot be resolved yields a placeholder entry
        with ``found=False`` instead of raising.

        Args:
            article_numbers: Article numbers to extract.

        Returns:
            The extracted articles (or not-found placeholders).
        """
        extracted_articles = []

        for article_num in article_numbers:
            try:
                extracted_articles.append(self._find_and_extract_article(article_num))
            except Exception as e:
                # Deliberately best-effort: one bad article (missing, or
                # malformed YAML underneath it) must not abort the batch.
                logger.error(f"条文{article_num}の抽出でエラー: {e}")
                extracted_articles.append(
                    ExtractedArticleContent(
                        article_num=article_num,
                        title="",
                        full_content=f"第{article_num}条の内容を取得できませんでした",
                        found=False,
                    )
                )

        return extracted_articles

    def _find_and_extract_article(self, article_num: str) -> "ExtractedArticleContent":
        """Find one article and render it.

        Args:
            article_num: Article number to look up.

        Returns:
            The extracted article content.

        Raises:
            ValueError: If no article with that number exists.
        """
        target_article = self._search_article_in_yaml(article_num)

        if not target_article:
            raise ValueError(f"第{article_num}条が見つかりません")

        return ExtractedArticleContent(
            article_num=article_num,
            title=target_article.get("title", ""),
            full_content=self._extract_full_article_text(target_article),
            found=True,
        )

    def _search_article_in_yaml(self, article_num: str) -> Optional[Dict[str, Any]]:
        """Return the article dict whose ``article_num`` matches, else ``None``.

        Numbers are compared by their string form, so a YAML integer ``3``
        matches a requested ``"3"`` and vice versa.

        A warning is logged only when the document has none of the known
        root containers; a well-formed document that simply lacks the
        requested article returns ``None`` silently (the caller reports it).
        """
        if not any(key in self.yaml_data for key in self._ROOT_KEYS):
            logger.warning("yaml構造にparts，chapters，articlesが存在しません．")
            return None

        wanted = str(article_num)
        for article in self._iter_articles():
            candidate = article.get("article_num")
            if candidate is not None and str(candidate) == wanted:
                return article

        return None

    def _iter_articles(self):
        """Yield every article dict reachable from the root layout in use."""
        data = self.yaml_data

        if "parts" in data:
            # Part layout (used by some large laws).
            for part in data["parts"] or []:
                for chapter in part.get("chapters") or []:
                    yield from self._iter_chapter_articles(chapter)
        elif "chapters" in data:
            # Chapter layout (typical laws).
            for chapter in data["chapters"] or []:
                yield from self._iter_chapter_articles(chapter)
        elif "articles" in data:
            # Flat article layout (enforcement regulations etc.).
            yield from data["articles"] or []

    @staticmethod
    def _iter_chapter_articles(chapter: Dict[str, Any]):
        """Yield a chapter's articles: direct ones first, then per section."""
        yield from chapter.get("articles") or []

        for section in chapter.get("sections") or []:
            # Within a section, subsection articles are visited before the
            # section's own direct articles.
            for subsection in section.get("subsections") or []:
                yield from subsection.get("articles") or []
            yield from section.get("articles") or []

    def _extract_full_article_text(self, article: Dict[str, Any]) -> str:
        """Render a whole article (header line plus all paragraphs) as text.

        TODO: how the article should be rendered as text is still open.
        TODO: XML -> YAML -> text is an inefficient pipeline.
        """
        title = article.get("title", "")
        caption = article.get("caption", "")
        article_num = article.get("article_num", "?")

        # Header: article number, optional caption in brackets, optional title.
        header = f"第{article_num}条（{caption}）" if caption else f"第{article_num}条"
        if title:
            header += f" {title}"

        content_parts = [header]

        # Paragraphs are numbered from 1 when they carry no explicit number.
        for position, paragraph in enumerate(article.get("paragraphs", []), start=1):
            paragraph_content = self._extract_paragraph_text(paragraph, position)
            if paragraph_content:
                content_parts.append(paragraph_content)

        return "\n".join(content_parts)

    def _extract_paragraph_text(
        self, paragraph: Dict[str, Any], paragraph_index: int
    ) -> str:
        """Render one paragraph: number label, body, items and table.

        Args:
            paragraph: Paragraph dict from the YAML tree.
            paragraph_index: 1-based position, used when the paragraph has
                no explicit ``paragraph_num``.
        """
        parts = []

        # The label is omitted for the first paragraph, which conventionally
        # carries no number. Compare by string form so an integer 1 from
        # YAML is treated the same as "1".
        paragraph_num = paragraph.get("paragraph_num", str(paragraph_index))
        if paragraph_num and str(paragraph_num) != "1":
            parts.append(f"（第{paragraph_num}項）")

        content = paragraph.get("content", "")
        if content:
            parts.append(content)

        # Items, each prefixed for visual nesting.
        for item in paragraph.get("items", []):
            item_text = self._extract_item_text(item)
            if item_text:
                parts.append(f" {item_text}")

        if "table" in paragraph:
            table_text = self._extract_table_text(paragraph["table"])
            if table_text:
                parts.append(f"【表】\n{table_text}")

        return "\n".join(parts) if parts else ""

    def _extract_item_text(self, item: Dict[str, Any]) -> str:
        """Render one item together with its sub-items."""
        parts = []

        title = item.get("title", "")
        content = item.get("content", "")

        # An item without content contributes no line of its own, even if
        # it has a title; its sub-items are still rendered.
        if title and content:
            parts.append(f"{title} {content}")
        elif content:
            parts.append(content)

        for subitem in item.get("subitems", []):
            subitem_text = self._extract_subitem_text(subitem)
            if subitem_text:
                parts.append(f" {subitem_text}")

        return "\n".join(parts) if parts else ""

    def _extract_subitem_text(self, subitem: Dict[str, Any]) -> str:
        """Render one sub-item, recursing into nested sub-items."""
        parts = []

        title = subitem.get("title", "")
        content = subitem.get("content", "")

        if title and content:
            parts.append(f"{title} {content}")
        elif content:
            parts.append(content)

        for nested_subitem in subitem.get("subitems", []):
            nested_text = self._extract_subitem_text(nested_subitem)
            if nested_text:
                parts.append(f" {nested_text}")

        return "\n".join(parts) if parts else ""

    def _extract_table_text(self, table: Dict[str, Any]) -> str:
        """Render a table as one ``| a | b |`` line per row.

        Rows that are not lists are skipped.
        """
        rows = table.get("rows", [])
        if not rows:
            return ""

        return "\n".join(
            "| " + " | ".join(str(cell) for cell in row) + " |"
            for row in rows
            if isinstance(row, list)
        )
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
class LawExtractor(BaseExtractor):
    """Extract the relevant articles from the main law (YAML-backed version).

    Works in two steps: the LLM names the relevant article numbers, then
    those articles are pulled verbatim out of the law's YAML structure.

    NOTE(review): ``self.llm``, ``self.prompt_manager``, ``self.prompt_name``
    and ``self._create_messages`` are assumed to come from ``BaseExtractor``,
    which is defined in ``law_extraction`` and not visible here.
    """

    def extract(self, state: GraphState) -> ExtractionResult:
        """Extract the relevant articles of the law (two-step approach).

        Args:
            state: Graph state; reads ``law_document`` and ``target_articles``.

        Returns:
            The formatted article text plus metadata describing which
            articles the LLM selected and why.
        """
        logger.info("法令からの関連条文抽出を開始")

        # Step 1: let the LLM pick the relevant article numbers.
        relevant_articles = self._identify_relevant_articles(state)

        # Step 2: pull those articles out of the YAML structure.
        extracted_content = self._extract_articles_from_yaml(
            state["law_document"], relevant_articles.article_numbers
        )

        return ExtractionResult(
            content=extracted_content,
            metadata={
                "stage": "law_extraction",
                "source_document": state["law_document"].name,
                "target_articles": state["target_articles"],
                "identified_articles": relevant_articles.article_numbers,
                "extraction_reasoning": relevant_articles.extraction_reasoning,
            },
        )

    def _identify_relevant_articles(self, state: GraphState) -> RelevantArticles:
        """Ask the LLM which article numbers are relevant."""
        logger.info("LLMによる関連条文番号の特定を開始")

        # Prompt context: the flattened state, overridden by the values
        # specific to this stage.
        base_context = flatten_state(state)
        special_context = {
            "law_name": state["law_document"].name,
            "law_article": ", ".join(state["target_articles"]),
            "law_text": state["law_document"].content,
        }

        formatted_prompt = self.prompt_manager.render_prompt(
            self.prompt_name,
            context={**base_context, **special_context},
        )
        messages = self._create_messages(formatted_prompt)

        # Structured-output mode: the reply is requested as RelevantArticles.
        structured_llm = self.llm.with_structured_output(RelevantArticles)
        response: RelevantArticles = structured_llm.invoke(messages)

        logger.info("特定された関連条文: %s", response.article_numbers)
        return response

    def _extract_articles_from_yaml(
        self, law_document: "LegalDocument", article_numbers: List[str]
    ) -> str:
        """Extract the given articles from the document's YAML structure.

        Args:
            law_document: Document expected to carry a ``yaml_data`` attribute.
            article_numbers: Article numbers as strings (e.g. ``'20_2'``).

        Returns:
            The formatted article text, or an error message string when the
            document has no YAML data.
        """
        logger.info("YAML構造から%d件の条文を抽出中", len(article_numbers))

        # Covers both "attribute missing" and "attribute is None".
        yaml_data = getattr(law_document, "yaml_data", None)
        if yaml_data is None:
            logger.error("法令文書にYAMLデータが含まれていません")
            return "エラー: YAML構造データが利用できません"

        extractor = YamlArticleExtractor(yaml_data)
        extracted_articles = extractor.extract_articles_by_numbers(article_numbers)

        return self._format_extracted_articles(extracted_articles)

    def _format_extracted_articles(
        self, extracted_articles: List[ExtractedArticleContent]
    ) -> str:
        """Join the extracted articles into one text block under a heading.

        Articles that could not be found are kept, prefixed with a notice
        marker.
        """
        formatted_parts = [" 法令本文：抽出された関連条項"]

        for article in extracted_articles:
            if article.found:
                formatted_parts.append(f"\n{article.full_content}\n")
            else:
                formatted_parts.append(f"\n【注意】{article.full_content}\n")

        return "\n".join(formatted_parts)
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
# 使用例とテスト
|
|
414
|
+
def test_law_extractor():
    """Smoke test / usage example for YAML-based article extraction.

    Builds a small flat-layout law, wraps it in a ``LegalDocument`` and
    extracts two articles through ``YamlArticleExtractor``, printing the
    result and failing if any requested article is not found.

    Article numbers are strings throughout, matching the ``List[str]``
    contract of ``extract_articles_by_numbers`` and ``RelevantArticles``.
    """
    # Sample YAML data.
    sample_yaml_data = {
        "law_info": {"title": "土壌汚染対策法", "law_num": "平成14年法律第53号"},
        "articles": [
            {
                "article_num": "3",
                "title": "土壌汚染状況調査",
                "paragraphs": [
                    {
                        "paragraph_num": "1",
                        "content": "都道府県知事は、有害物質使用特定施設の使用が廃止されたときは、当該有害物質使用特定施設に係る工場又は事業場の敷地であった土地について、土壌汚染状況調査を行わせるものとする。",
                        "items": [
                            {
                                "item_num": 1,
                                "title": "一",
                                "content": "有害物質使用特定施設において製造、使用又は処理されていた物質",
                            }
                        ],
                    }
                ],
            },
            {
                "article_num": "4",
                "title": "調査命令",
                "paragraphs": [
                    {
                        "paragraph_num": "1",
                        "content": "都道府県知事は、土壌汚染により人の健康に係る被害が生ずるおそれがあるものとして環境省令で定める基準に該当する土地があると認めるときは、当該土地の所有者等に対し、土壌汚染状況調査を行うべきことを命ずることができる。",
                    }
                ],
            },
        ],
    }

    # Wrap the data the way the pipeline does.
    law_document = LegalDocument(
        name="土壌汚染対策法",
        content="法令の本文テキスト...",
        document_type="law",
        yaml_data=sample_yaml_data,
    )

    # Exercise YamlArticleExtractor on the document's YAML payload.
    extractor = YamlArticleExtractor(law_document.yaml_data)
    extracted = extractor.extract_articles_by_numbers(["3", "4"])

    print("=== 条文抽出テスト結果 ===")
    for article in extracted:
        print(f"\n【第{article.article_num}条】")
        print(f"タイトル: {article.title}")
        print(f"見つかった: {article.found}")
        print(f"内容:\n{article.full_content}")

    # Both articles exist in the sample, so a miss is a real failure.
    missing = [article.article_num for article in extracted if not article.found]
    assert not missing, f"articles not found: {missing}"
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
class RegulationExtractor(BaseExtractor):
    """Extract the relevant articles from the enforcement regulation.

    Works in two steps: the LLM names the relevant article numbers, then
    those articles are pulled verbatim out of the regulation's YAML
    structure.

    NOTE(review): ``self.llm``, ``self.prompt_manager``, ``self.prompt_name``
    and ``self._create_messages`` are assumed to come from ``BaseExtractor``,
    which is defined in ``law_extraction`` and not visible here.
    """

    def extract(self, state: GraphState) -> ExtractionResult:
        """Extract the relevant regulation articles (two-step approach).

        Args:
            state: Graph state; reads ``regulation_document``,
                ``law_document``, ``target_articles`` and
                ``extracted_law_content``.

        Returns:
            The formatted article text plus metadata describing which
            articles the LLM selected and why.
        """
        logger.info("施行規則からの関連条文抽出を開始")

        # Step 1: let the LLM pick the relevant article numbers.
        relevant_articles = self._identify_relevant_articles(state)

        # Step 2: pull those articles out of the YAML structure.
        extracted_content = self._extract_articles_from_yaml(
            state["regulation_document"], relevant_articles.article_numbers
        )

        return ExtractionResult(
            content=extracted_content,
            metadata={
                "stage": "regulation_extraction",
                "source_document": state["regulation_document"].name,
                "law_reference": state["law_document"].name,
                "identified_articles": relevant_articles.article_numbers,
                "extraction_reasoning": relevant_articles.extraction_reasoning,
            },
        )

    def _identify_relevant_articles(self, state: GraphState) -> RelevantArticles:
        """Ask the LLM which regulation article numbers are relevant."""
        logger.info("LLMによる施行規則の関連条文番号の特定を開始")

        # Prompt context: the flattened state, overridden by the values
        # specific to this stage (including the law text already extracted).
        base_context = flatten_state(state)
        special_context = {
            "law_name": state["law_document"].name,
            "law_article": ", ".join(state["target_articles"]),
            "extracted_law_content": state["extracted_law_content"],
            "regulation_text": state["regulation_document"].content,
        }
        formatted_prompt = self.prompt_manager.render_prompt(
            self.prompt_name,
            context={**base_context, **special_context},
        )
        messages = self._create_messages(formatted_prompt)

        # Structured-output mode: the reply is requested as RelevantArticles.
        structured_llm = self.llm.with_structured_output(RelevantArticles)
        response: RelevantArticles = structured_llm.invoke(messages)

        logger.info("特定された関連条文: %s", response.article_numbers)
        return response

    def _extract_articles_from_yaml(
        self, regulation_document: "LegalDocument", article_numbers: List[str]
    ) -> str:
        """Extract the given articles from the document's YAML structure.

        Args:
            regulation_document: Document expected to carry a ``yaml_data``
                attribute.
            article_numbers: Article numbers as strings (e.g. ``'20_2'``).

        Returns:
            The formatted article text, or an error message string when the
            document has no YAML data.
        """
        logger.info("施行規則のYAML構造から%d件の条文を抽出中", len(article_numbers))

        # Covers both "attribute missing" and "attribute is None".
        yaml_data = getattr(regulation_document, "yaml_data", None)
        if yaml_data is None:
            logger.error("施行規則文書にYAMLデータが含まれていません")
            return "エラー: YAML構造データが利用できません"

        extractor = YamlArticleExtractor(yaml_data)
        extracted_articles = extractor.extract_articles_by_numbers(article_numbers)

        return self._format_extracted_articles(extracted_articles)

    def _format_extracted_articles(
        self, extracted_articles: List[ExtractedArticleContent]
    ) -> str:
        """Join the extracted articles into one text block under a heading.

        Articles that could not be found are kept, prefixed with a notice
        marker.
        """
        formatted_parts = [" 施行規則：抽出された関連条項"]

        for article in extracted_articles:
            if article.found:
                formatted_parts.append(f"\n{article.full_content}\n")
            else:
                formatted_parts.append(f"\n【注意】{article.full_content}\n")

        return "\n".join(formatted_parts)
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
class GraphBuilder:
    """Builds and runs the LangGraph workflow for legal key-point extraction.

    The workflow runs three stages in sequence (law extraction, regulation
    extraction, summary generation) and diverts to an error node as soon as
    a stage records an ``error_message`` in the state. YAML-aware version.
    """

    # Prompt name used for each LLM call unless overridden by the caller.
    DEFAULT_PROMPT_NAMES = {
        "extract_law": "extract_laws_v001",
        "extract_regulation": "extract_regulation_v001",
        "generate_summary": "v003",
    }

    def __init__(
        self,
        llm: BaseLLM,
        prompts_dir: Path = Path("prompts"),
        prompt_names: Optional[Dict[str, str]] = None,
    ):
        """
        Args:
            llm: Language model shared by all three stages.
            prompts_dir: Directory handed to ``PromptManager``.
            prompt_names: Optional per-stage overrides of
                ``DEFAULT_PROMPT_NAMES``.
        """
        self.llm = llm
        self.prompt_manager = PromptManager(prompts_dir)

        # Start from the defaults; caller-supplied names win.
        resolved_names = dict(self.DEFAULT_PROMPT_NAMES)
        if prompt_names:
            resolved_names.update(prompt_names)
        self.prompt_names = resolved_names

        # One worker object per stage.
        self.law_extractor = LawExtractor(
            llm, self.prompt_manager, resolved_names["extract_law"]
        )
        self.regulation_extractor = RegulationExtractor(
            llm, self.prompt_manager, resolved_names["extract_regulation"]
        )
        self.summary_generator = ViewpointGenerator(
            llm, self.prompt_manager, resolved_names["generate_summary"]
        )

        self.graph = self._build_graph()

    def _build_graph(self):  # TODO: annotate the return type as CompiledGraph
        """Assemble and compile the LangGraph state graph."""
        workflow = StateGraph(GraphState)

        # Nodes.
        workflow.add_node("extract_law", self._extract_law_node)
        workflow.add_node("extract_regulation", self._extract_regulation_node)
        workflow.add_node("generate_summary", self._generate_summary_node)
        workflow.add_node("handle_error", self._handle_error_node)

        # Edges: each extraction stage either continues or goes to the
        # error handler.
        workflow.set_entry_point("extract_law")
        workflow.add_conditional_edges(
            "extract_law",
            self._should_continue_to_regulation,
            {"continue": "extract_regulation", "error": "handle_error"},
        )
        workflow.add_conditional_edges(
            "extract_regulation",
            self._should_continue_to_summary,
            {"continue": "generate_summary", "error": "handle_error"},
        )
        workflow.add_edge("generate_summary", END)
        workflow.add_edge("handle_error", END)

        return workflow.compile()

    def _extract_law_node(self, state: GraphState) -> GraphState:
        """Graph node: run the law extractor and record its output in the state.

        Any exception is captured into ``state["error_message"]`` rather
        than propagated, so the graph can route to the error node.
        """
        try:
            outcome = self.law_extractor.extract(state)
            state["extracted_law_content"] = outcome.content
            state["current_stage"] = ProcessingStage.LAW_EXTRACTION
            state["metadata"].update(outcome.metadata)
            state["extracted_law_article_numbers"] = outcome.metadata[
                "identified_articles"
            ]
            logger.info("法令抽出が完了しました")
        except Exception as err:
            state["error_message"] = f"法令抽出エラー: {err}"
            logger.error(state["error_message"])

        return state

    def _extract_regulation_node(self, state: GraphState) -> GraphState:
        """Graph node: run the regulation extractor and record its output.

        Any exception is captured into ``state["error_message"]`` rather
        than propagated.
        """
        try:
            outcome = self.regulation_extractor.extract(state)
            state["extracted_regulation_content"] = outcome.content
            state["current_stage"] = ProcessingStage.REGULATION_EXTRACTION
            state["metadata"].update(outcome.metadata)
            state["extracted_regulation_article_numbers"] = outcome.metadata[
                "identified_articles"
            ]
            logger.info("施行規則抽出が完了しました")
        except Exception as err:
            state["error_message"] = f"施行規則抽出エラー: {err}"
            logger.error(state["error_message"])

        return state

    def _generate_summary_node(self, state: GraphState) -> GraphState:
        """Graph node: generate the final summary and mark the run completed.

        Any exception is captured into ``state["error_message"]`` rather
        than propagated.
        """
        try:
            outcome = self.summary_generator.extract(state)
            state["final_summary"] = outcome.content
            state["current_stage"] = ProcessingStage.COMPLETED
            state["metadata"].update(outcome.metadata)
            logger.info("要点生成が完了しました")
        except Exception as err:
            state["error_message"] = f"要点生成エラー: {err}"
            logger.error(state["error_message"])

        return state

    def _handle_error_node(self, state: GraphState) -> GraphState:
        """Graph node: log the recorded error and close the run.

        The stage is set to ``COMPLETED`` here as well; callers tell success
        from failure by checking ``error_message``.
        """
        recorded_error = state.get("error_message", "不明なエラー")
        logger.error(f"処理中にエラーが発生しました: {recorded_error}")
        state["current_stage"] = ProcessingStage.COMPLETED
        return state

    def _should_continue_to_regulation(self, state: GraphState) -> str:
        """Routing decision after law extraction: ``"error"`` or ``"continue"``."""
        if state.get("error_message"):
            return "error"
        return "continue"

    def _should_continue_to_summary(self, state: GraphState) -> str:
        """Routing decision after regulation extraction: ``"error"`` or ``"continue"``."""
        if state.get("error_message"):
            return "error"
        return "continue"

    def process(
        self,
        law_document: LegalDocument,
        regulation_document: LegalDocument,
        target_articles: List[str],
    ) -> GraphState:
        """Run the whole workflow for one law/regulation pair.

        Args:
            law_document: The main law.
            regulation_document: The matching enforcement regulation.
            target_articles: Article numbers the analysis focuses on.

        Returns:
            The final graph state as returned by the compiled graph.
        """
        # Every output slot starts empty; stages fill them in as they run.
        initial_state = GraphState(
            law_document=law_document,
            regulation_document=regulation_document,
            target_articles=target_articles,
            extracted_law_content=None,
            extracted_law_article_numbers=None,
            extracted_regulation_content=None,
            extracted_regulation_article_numbers=None,
            final_summary=None,
            current_stage=ProcessingStage.LAW_EXTRACTION,
            error_message=None,
            metadata={},
        )

        logger.info("法令判断軸抽出処理を開始します")
        final_state = self.graph.invoke(initial_state)
        logger.info(f"処理が完了しました。ステージ: {final_state['current_stage']}")

        return final_state
|
|
719
|
+
|
|
720
|
+
|
|
721
|
+
class LegalExtractionConfig:
    """Plain holder for the settings needed to build the extraction system."""

    def __init__(
        self,
        llm,
        prompts_dir: str = "prompts",
        prompt_names: Optional[Dict[str, str]] = None,
    ):
        """
        Args:
            llm: Language model object, stored as given.
            prompts_dir: Prompt directory; converted to a ``Path``.
            prompt_names: Optional per-stage prompt name overrides, stored
                as given (``None`` means "no overrides").
        """
        self.prompt_names = prompt_names
        self.prompts_dir = Path(prompts_dir)
        self.llm = llm
|
|
733
|
+
|
|
734
|
+
|
|
735
|
+
def create_legal_extraction_system(config: LegalExtractionConfig) -> GraphBuilder:
    """Factory: build a ``GraphBuilder`` from a ``LegalExtractionConfig``.

    Args:
        config: Supplies the LLM, the prompt directory and the optional
            prompt name overrides.

    Returns:
        A ``GraphBuilder`` constructed from those three settings.
    """
    builder = GraphBuilder(
        llm=config.llm,
        prompts_dir=config.prompts_dir,
        prompt_names=config.prompt_names,
    )
    return builder
|