jfinqa-helm 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jfinqa_helm-0.1.0/.gitignore +9 -0
- jfinqa_helm-0.1.0/PKG-INFO +50 -0
- jfinqa_helm-0.1.0/README.md +31 -0
- jfinqa_helm-0.1.0/pyproject.toml +37 -0
- jfinqa_helm-0.1.0/src/jfinqa_helm/__init__.py +1 -0
- jfinqa_helm-0.1.0/src/jfinqa_helm/run_specs.py +30 -0
- jfinqa_helm-0.1.0/src/jfinqa_helm/scenario.py +97 -0
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: jfinqa-helm
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: HELM plugin for JFinQA: Japanese Financial Numerical Reasoning QA Benchmark
|
|
5
|
+
Project-URL: Homepage, https://github.com/ajtgjmdjp/jfinqa
|
|
6
|
+
Project-URL: Dataset, https://huggingface.co/datasets/ajtgjmdjp/jfinqa
|
|
7
|
+
Author: ajtgjmdjp
|
|
8
|
+
License: Apache-2.0
|
|
9
|
+
Keywords: benchmark,finance,helm,japanese,nlp
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
|
+
Requires-Python: >=3.9
|
|
16
|
+
Requires-Dist: crfm-helm>=0.5.0
|
|
17
|
+
Requires-Dist: datasets>=2.0
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
# jfinqa-helm
|
|
21
|
+
|
|
22
|
+
HELM plugin for [JFinQA](https://github.com/ajtgjmdjp/jfinqa): Japanese Financial Numerical Reasoning QA Benchmark.
|
|
23
|
+
|
|
24
|
+
## Installation
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install crfm-helm jfinqa-helm
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Usage
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
helm-run --run-entries jfinqa:model=openai/gpt-4o
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## About JFinQA
|
|
37
|
+
|
|
38
|
+
JFinQA contains 1,000 questions across three subtasks:
|
|
39
|
+
|
|
40
|
+
- **numerical_reasoning** (550 questions): Calculate financial ratios, growth rates, etc.
|
|
41
|
+
- **consistency_checking** (200 questions): Verify whether a statement is consistent with financial data
|
|
42
|
+
- **temporal_reasoning** (250 questions): Reason about changes over multiple fiscal years
|
|
43
|
+
|
|
44
|
+
Questions are drawn from 68 companies' EDINET filings covering J-GAAP, IFRS, and US-GAAP.
|
|
45
|
+
|
|
46
|
+
**Dataset**: [ajtgjmdjp/jfinqa](https://huggingface.co/datasets/ajtgjmdjp/jfinqa)
|
|
47
|
+
|
|
48
|
+
## License
|
|
49
|
+
|
|
50
|
+
Apache-2.0
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# jfinqa-helm
|
|
2
|
+
|
|
3
|
+
HELM plugin for [JFinQA](https://github.com/ajtgjmdjp/jfinqa): Japanese Financial Numerical Reasoning QA Benchmark.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install crfm-helm jfinqa-helm
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
helm-run --run-entries jfinqa:model=openai/gpt-4o
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## About JFinQA
|
|
18
|
+
|
|
19
|
+
JFinQA contains 1,000 questions across three subtasks:
|
|
20
|
+
|
|
21
|
+
- **numerical_reasoning** (550 questions): Calculate financial ratios, growth rates, etc.
|
|
22
|
+
- **consistency_checking** (200 questions): Verify whether a statement is consistent with financial data
|
|
23
|
+
- **temporal_reasoning** (250 questions): Reason about changes over multiple fiscal years
|
|
24
|
+
|
|
25
|
+
Questions are drawn from 68 companies' EDINET filings covering J-GAAP, IFRS, and US-GAAP.
|
|
26
|
+
|
|
27
|
+
**Dataset**: [ajtgjmdjp/jfinqa](https://huggingface.co/datasets/ajtgjmdjp/jfinqa)
|
|
28
|
+
|
|
29
|
+
## License
|
|
30
|
+
|
|
31
|
+
Apache-2.0
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "jfinqa-helm"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "HELM plugin for JFinQA: Japanese Financial Numerical Reasoning QA Benchmark"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "Apache-2.0" }
|
|
11
|
+
authors = [{ name = "ajtgjmdjp" }]
|
|
12
|
+
requires-python = ">=3.9"
|
|
13
|
+
dependencies = [
|
|
14
|
+
"crfm-helm>=0.5.0",
|
|
15
|
+
"datasets>=2.0",
|
|
16
|
+
]
|
|
17
|
+
keywords = ["helm", "benchmark", "japanese", "finance", "nlp"]
|
|
18
|
+
classifiers = [
|
|
19
|
+
"Development Status :: 3 - Alpha",
|
|
20
|
+
"Intended Audience :: Science/Research",
|
|
21
|
+
"License :: OSI Approved :: Apache Software License",
|
|
22
|
+
"Programming Language :: Python :: 3",
|
|
23
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
[project.urls]
|
|
27
|
+
Homepage = "https://github.com/ajtgjmdjp/jfinqa"
|
|
28
|
+
Dataset = "https://huggingface.co/datasets/ajtgjmdjp/jfinqa"
|
|
29
|
+
|
|
30
|
+
[project.entry-points.helm]
|
|
31
|
+
jfinqa = "jfinqa_helm.run_specs"
|
|
32
|
+
|
|
33
|
+
[tool.hatch.build.targets.wheel]
|
|
34
|
+
packages = ["src/jfinqa_helm"]
|
|
35
|
+
|
|
36
|
+
[tool.hatch.build.targets.sdist]
|
|
37
|
+
exclude = [".env", ".env.*", ".claude/"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""HELM plugin for JFinQA: Japanese Financial Numerical Reasoning QA Benchmark."""
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Run spec for JFinQA HELM plugin."""
|
|
2
|
+
|
|
3
|
+
from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec
|
|
4
|
+
from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs
|
|
5
|
+
from helm.benchmark.run_spec import RunSpec, run_spec_function
|
|
6
|
+
from helm.benchmark.scenarios.scenario import ScenarioSpec
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@run_spec_function("jfinqa")
def get_jfinqa_spec() -> RunSpec:
    """Build the HELM run spec for the JFinQA benchmark.

    The spec wires together:

    * the ``JFinQAScenario`` scenario class (referenced by dotted path, no
      constructor arguments),
    * a generation adapter with bilingual (Japanese / English) instructions,
      an ``Answer`` output prefix and a 50-token completion budget,
    * exact-match style metrics comparing the completion with the reference
      answer.

    Returns:
        The ``RunSpec`` registered under the run entry name ``jfinqa``.
    """
    scenario_spec = ScenarioSpec(
        class_name="jfinqa_helm.scenario.JFinQAScenario",
        args={},
    )

    adapter_spec = get_generation_adapter_spec(
        instructions=(
            "以下の財務データを読み、質問に正確な数値で答えてください。\n"
            "Read the following financial data and answer the question with the exact numeric value.\n"
        ),
        # The scenario already labels the question inside the input text, so
        # no additional input prefix is requested here.
        input_noun=None,
        output_noun="Answer",
        max_tokens=50,
    )

    # An empty name list asks for no reference-based metric at all, which
    # leaves the run without any accuracy score. Request the exact-match
    # family so completions are actually compared with the gold answer.
    # NOTE(review): confirm these metric names against the installed
    # crfm-helm version.
    metric_specs = get_basic_metric_specs(
        [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match",
        ]
    )

    return RunSpec(
        name="jfinqa",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=metric_specs,
        groups=["jfinqa"],
    )
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""JFinQA: Japanese Financial Numerical Reasoning QA Benchmark.
|
|
2
|
+
|
|
3
|
+
Data source:
|
|
4
|
+
https://huggingface.co/datasets/ajtgjmdjp/jfinqa
|
|
5
|
+
|
|
6
|
+
JFinQA is a benchmark for numerical reasoning over Japanese corporate
|
|
7
|
+
financial disclosures. It contains 1,000 questions across three subtasks
|
|
8
|
+
—numerical reasoning (550), consistency checking (200), and temporal
|
|
9
|
+
reasoning (250)—drawn from 68 companies' EDINET filings covering
|
|
10
|
+
J-GAAP, IFRS, and US-GAAP.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import os
|
|
14
|
+
from typing import Any, Dict, List
|
|
15
|
+
|
|
16
|
+
from datasets import load_dataset
|
|
17
|
+
|
|
18
|
+
from helm.benchmark.scenarios.scenario import (
|
|
19
|
+
CORRECT_TAG,
|
|
20
|
+
TEST_SPLIT,
|
|
21
|
+
Input,
|
|
22
|
+
Instance,
|
|
23
|
+
Output,
|
|
24
|
+
Reference,
|
|
25
|
+
Scenario,
|
|
26
|
+
)
|
|
27
|
+
from helm.common.general import ensure_directory_exists
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class JFinQAScenario(Scenario):
    """Japanese Financial Numerical Reasoning QA.

    Loads the ``test`` split of every subset listed in ``SUBSETS`` from the
    Hugging Face dataset ``HF_DATASET_ID`` and converts each row into a HELM
    ``Instance``. The instance input is built from the row's optional
    ``pre_text``, a Markdown rendering of ``table_headers`` / ``table_rows``,
    the optional ``post_text`` and the ``question``; the row's ``answer`` is
    the single correct reference.
    """

    name = "jfinqa"
    description = (
        "JFinQA: Japanese Financial Numerical Reasoning QA — "
        "1,000 questions across numerical reasoning, consistency checking, "
        "and temporal reasoning from 68 companies' EDINET filings."
    )
    tags = ["question_answering", "finance", "japanese"]

    # Hugging Face Hub dataset identifier and the configurations loaded from it.
    HF_DATASET_ID = "ajtgjmdjp/jfinqa"
    SUBSETS = ("numerical_reasoning", "consistency_checking", "temporal_reasoning")

    @staticmethod
    def _join_text(value: Any) -> str:
        """Return a text field as one newline-separated string.

        Args:
            value: Either a single string or an iterable of paragraph-like
                items. Items are passed through ``str`` so that non-string
                entries do not break the join.

        Returns:
            The string itself when ``value`` is a ``str`` (joining it would
            split it into characters), otherwise the items joined by
            newlines.
        """
        if isinstance(value, str):
            return value
        return "\n".join(str(item) for item in value)

    @staticmethod
    def _format_table(headers: List[str], rows: List[List[Any]]) -> str:
        """Render a header list and data rows as a Markdown pipe table.

        Args:
            headers: Column titles; one ``---`` separator cell is emitted per
                header.
            rows: Data rows; every cell is converted with ``str``.

        Returns:
            The header line, the separator line and one line per data row,
            joined by newlines.
        """

        def as_line(cells: List[str]) -> str:
            return "| " + " | ".join(cells) + " |"

        lines = [
            as_line([str(header) for header in headers]),
            as_line(["---" for _ in headers]),
        ]
        lines.extend(as_line([str(cell) for cell in row]) for row in rows)
        return "\n".join(lines)

    @staticmethod
    def _build_input(row: Dict[str, Any]) -> str:
        """Assemble the prompt text for one dataset row.

        Sections are emitted in the order pre-text, table, post-text and
        question, separated by blank lines. The text sections are skipped
        when empty and the table is skipped when there are no headers; the
        ``Question:`` line is always present.

        Args:
            row: A dataset record. The keys ``pre_text``, ``table_headers``,
                ``table_rows``, ``post_text`` and ``question`` are read, all
                of them optional.

        Returns:
            The assembled input text.
        """
        parts: List[str] = []

        pre_text = row.get("pre_text", [])
        if pre_text:
            parts.append(JFinQAScenario._join_text(pre_text))

        headers = row.get("table_headers", [])
        if headers:
            # A null ``table_rows`` value is treated as "no data rows" rather
            # than failing while iterating over None.
            table_rows = row.get("table_rows") or []
            parts.append(JFinQAScenario._format_table(headers, table_rows))

        post_text = row.get("post_text", [])
        if post_text:
            parts.append(JFinQAScenario._join_text(post_text))

        question = row.get("question", "")
        parts.append(f"Question: {question}")

        return "\n\n".join(parts)

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download all subsets and convert their rows to HELM instances.

        Args:
            output_path: Scenario output directory; the dataset cache is
                placed in its ``data`` sub-directory, which is created when
                missing.

        Returns:
            One test-split ``Instance`` per dataset row, in ``SUBSETS``
            order.

        Raises:
            KeyError: If a row has no ``answer`` field.
        """
        cache_dir = os.path.join(output_path, "data")
        ensure_directory_exists(cache_dir)

        instances: List[Instance] = []
        for subset in self.SUBSETS:
            # NOTE(review): trust_remote_code=True permits execution of code
            # shipped with the dataset repository. Confirm it is still needed
            # for this dataset and supported by the installed ``datasets``
            # version; drop it if the dataset is served as plain data files.
            dataset = load_dataset(
                self.HF_DATASET_ID,
                subset,
                split="test",
                cache_dir=cache_dir,
                trust_remote_code=True,
            )
            for row in dataset:
                reference = Reference(
                    Output(text=str(row["answer"])), tags=[CORRECT_TAG]
                )
                # A missing or empty id must not become the literal "" (or
                # "None") shared by every instance; leave it unset instead.
                raw_id = row.get("id")
                instance_id = None if raw_id in (None, "") else str(raw_id)
                instances.append(
                    Instance(
                        input=Input(text=self._build_input(row)),
                        references=[reference],
                        split=TEST_SPLIT,
                        id=instance_id,
                    )
                )
        return instances
|