jfinqa-helm 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ dist/
5
+ build/
6
+ *.egg-info/
7
+ .venv/
8
+ .env
9
+ .env.*
@@ -0,0 +1,50 @@
1
+ Metadata-Version: 2.4
2
+ Name: jfinqa-helm
3
+ Version: 0.1.0
4
+ Summary: HELM plugin for JFinQA: Japanese Financial Numerical Reasoning QA Benchmark
5
+ Project-URL: Homepage, https://github.com/ajtgjmdjp/jfinqa
6
+ Project-URL: Dataset, https://huggingface.co/datasets/ajtgjmdjp/jfinqa
7
+ Author: ajtgjmdjp
8
+ License: Apache-2.0
9
+ Keywords: benchmark,finance,helm,japanese,nlp
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
+ Requires-Python: >=3.9
16
+ Requires-Dist: crfm-helm>=0.5.0
17
+ Requires-Dist: datasets>=2.0
18
+ Description-Content-Type: text/markdown
19
+
20
+ # jfinqa-helm
21
+
22
+ HELM plugin for [JFinQA](https://github.com/ajtgjmdjp/jfinqa): Japanese Financial Numerical Reasoning QA Benchmark.
23
+
24
+ ## Installation
25
+
26
+ ```bash
27
+ pip install crfm-helm jfinqa-helm
28
+ ```
29
+
30
+ ## Usage
31
+
32
+ ```bash
33
+ helm-run --run-entries jfinqa:model=openai/gpt-4o --suite jfinqa --max-eval-instances 1000
34
+ ```
35
+
36
+ ## About JFinQA
37
+
38
+ JFinQA contains 1,000 questions across three subtasks:
39
+
40
+ - **numerical_reasoning** (550 questions): Calculate financial ratios, growth rates, etc.
41
+ - **consistency_checking** (200 questions): Verify whether a statement is consistent with financial data
42
+ - **temporal_reasoning** (250 questions): Reason about changes over multiple fiscal years
43
+
44
+ Questions are drawn from 68 companies' EDINET filings covering J-GAAP, IFRS, and US-GAAP.
45
+
46
+ **Dataset**: [ajtgjmdjp/jfinqa](https://huggingface.co/datasets/ajtgjmdjp/jfinqa)
47
+
48
+ ## License
49
+
50
+ Apache-2.0
@@ -0,0 +1,31 @@
1
+ # jfinqa-helm
2
+
3
+ HELM plugin for [JFinQA](https://github.com/ajtgjmdjp/jfinqa): Japanese Financial Numerical Reasoning QA Benchmark.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install crfm-helm jfinqa-helm
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```bash
14
+ helm-run --run-entries jfinqa:model=openai/gpt-4o --suite jfinqa --max-eval-instances 1000
15
+ ```
16
+
17
+ ## About JFinQA
18
+
19
+ JFinQA contains 1,000 questions across three subtasks:
20
+
21
+ - **numerical_reasoning** (550 questions): Calculate financial ratios, growth rates, etc.
22
+ - **consistency_checking** (200 questions): Verify whether a statement is consistent with financial data
23
+ - **temporal_reasoning** (250 questions): Reason about changes over multiple fiscal years
24
+
25
+ Questions are drawn from 68 companies' EDINET filings covering J-GAAP, IFRS, and US-GAAP.
26
+
27
+ **Dataset**: [ajtgjmdjp/jfinqa](https://huggingface.co/datasets/ajtgjmdjp/jfinqa)
28
+
29
+ ## License
30
+
31
+ Apache-2.0
@@ -0,0 +1,37 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "jfinqa-helm"
7
+ version = "0.1.0"
8
+ description = "HELM plugin for JFinQA: Japanese Financial Numerical Reasoning QA Benchmark"
9
+ readme = "README.md"
10
+ license = { text = "Apache-2.0" }
11
+ authors = [{ name = "ajtgjmdjp" }]
12
+ requires-python = ">=3.9"
13
+ dependencies = [
14
+ "crfm-helm>=0.5.0",
15
+ "datasets>=2.0",
16
+ ]
17
+ keywords = ["helm", "benchmark", "japanese", "finance", "nlp"]
18
+ classifiers = [
19
+ "Development Status :: 3 - Alpha",
20
+ "Intended Audience :: Science/Research",
21
+ "License :: OSI Approved :: Apache Software License",
22
+ "Programming Language :: Python :: 3",
23
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
24
+ ]
25
+
26
+ [project.urls]
27
+ Homepage = "https://github.com/ajtgjmdjp/jfinqa"
28
+ Dataset = "https://huggingface.co/datasets/ajtgjmdjp/jfinqa"
29
+
30
+ [project.entry-points.helm]
31
+ jfinqa = "jfinqa_helm.run_specs"
32
+
33
+ [tool.hatch.build.targets.wheel]
34
+ packages = ["src/jfinqa_helm"]
35
+
36
+ [tool.hatch.build.targets.sdist]
37
+ exclude = [".env", ".env.*", ".claude/"]
@@ -0,0 +1 @@
1
+ """HELM plugin for JFinQA: Japanese Financial Numerical Reasoning QA Benchmark."""
@@ -0,0 +1,30 @@
1
+ """Run spec for JFinQA HELM plugin."""
2
+
3
+ from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec
4
+ from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs
5
+ from helm.benchmark.run_spec import RunSpec, run_spec_function
6
+ from helm.benchmark.scenarios.scenario import ScenarioSpec
7
+
8
+
9
@run_spec_function("jfinqa")
def get_jfinqa_spec() -> RunSpec:
    """Build the HELM run spec registered under the name ``jfinqa``.

    The spec points at ``jfinqa_helm.scenario.JFinQAScenario`` (no scenario
    arguments), uses a generation adapter with a bilingual
    (Japanese + English) instruction, the output noun ``"Answer"`` and a
    50-token completion budget, and scores completions with the exact-match
    family of basic metrics.

    Returns:
        The ``RunSpec`` for the ``jfinqa`` run group.
    """
    scenario_spec = ScenarioSpec(
        class_name="jfinqa_helm.scenario.JFinQAScenario", args={}
    )
    adapter_spec = get_generation_adapter_spec(
        instructions=(
            "以下の財務データを読み、質問に正確な数値で答えてください。\n"
            "Read the following financial data and answer the question with the exact numeric value.\n"
        ),
        input_noun=None,
        output_noun="Answer",
        max_tokens=50,
    )
    # Name the exact-match metrics explicitly: an empty name list requests no
    # reference-comparison metric, which leaves the benchmark without an
    # accuracy score even though every instance carries a correct reference.
    metric_specs = get_basic_metric_specs(
        [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match",
        ]
    )
    return RunSpec(
        name="jfinqa",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=metric_specs,
        groups=["jfinqa"],
    )
@@ -0,0 +1,97 @@
1
+ """JFinQA: Japanese Financial Numerical Reasoning QA Benchmark.
2
+
3
+ Data source:
4
+ https://huggingface.co/datasets/ajtgjmdjp/jfinqa
5
+
6
+ JFinQA is a benchmark for numerical reasoning over Japanese corporate
7
+ financial disclosures. It contains 1,000 questions across three subtasks
8
+ —numerical reasoning (550), consistency checking (200), and temporal
9
+ reasoning (250)—drawn from 68 companies' EDINET filings covering
10
+ J-GAAP, IFRS, and US-GAAP.
11
+ """
12
+
13
+ import os
14
+ from typing import Any, Dict, List
15
+
16
+ from datasets import load_dataset
17
+
18
+ from helm.benchmark.scenarios.scenario import (
19
+ CORRECT_TAG,
20
+ TEST_SPLIT,
21
+ Input,
22
+ Instance,
23
+ Output,
24
+ Reference,
25
+ Scenario,
26
+ )
27
+ from helm.common.general import ensure_directory_exists
28
+
29
+
30
class JFinQAScenario(Scenario):
    """Japanese Financial Numerical Reasoning QA.

    Loads every subset listed in ``SUBSETS`` from the Hugging Face dataset
    ``HF_DATASET_ID`` and converts each row into one test-split instance.
    The instance input is the row's leading text, a pipe-delimited table,
    the trailing text and the question; its single reference is the row's
    answer, tagged as correct.
    """

    name = "jfinqa"
    description = (
        "JFinQA: Japanese Financial Numerical Reasoning QA — "
        "1,000 questions across numerical reasoning, consistency checking, "
        "and temporal reasoning from 68 companies' EDINET filings."
    )
    tags = ["question_answering", "finance", "japanese"]

    HF_DATASET_ID = "ajtgjmdjp/jfinqa"
    SUBSETS = ("numerical_reasoning", "consistency_checking", "temporal_reasoning")

    @staticmethod
    def _format_table(headers: List[str], rows: List[List[str]]) -> str:
        """Render ``headers`` and ``rows`` as a pipe-delimited text table.

        The output is a header line, a ``---`` separator line with one cell
        per header, then one line per row. Every cell is passed through
        ``str``.
        """

        def render(cells: List[Any]) -> str:
            return "| " + " | ".join(str(cell) for cell in cells) + " |"

        lines = [render(headers), render(["---"] * len(headers))]
        lines.extend(render(cells) for cells in rows)
        return "\n".join(lines)

    @staticmethod
    def _join_lines(value: Any) -> str:
        """Return a text field as one string.

        A plain string is returned unchanged, because joining it would
        insert a newline between every character. Any other non-empty value
        is treated as a sequence of strings joined by newlines. ``None`` and
        empty values yield ``""``.
        """
        if not value:
            return ""
        if isinstance(value, str):
            return value
        return "\n".join(value)

    @staticmethod
    def _build_input(row: Dict[str, Any]) -> str:
        """Assemble the prompt text for one dataset row.

        Non-empty parts are emitted in the order ``pre_text``, table,
        ``post_text``, followed by a ``Question: ...`` line that is always
        present. Parts are separated by a blank line. The table is included
        only when ``table_headers`` is non-empty.
        """
        parts: List[str] = []

        pre_text = JFinQAScenario._join_lines(row.get("pre_text"))
        if pre_text:
            parts.append(pre_text)

        headers = row.get("table_headers") or []
        if headers:
            # A row can carry headers while "table_rows" is present but None;
            # fall back to an empty body instead of failing on iteration.
            table_rows = row.get("table_rows") or []
            parts.append(JFinQAScenario._format_table(headers, table_rows))

        post_text = JFinQAScenario._join_lines(row.get("post_text"))
        if post_text:
            parts.append(post_text)

        question = row.get("question", "")
        parts.append(f"Question: {question}")

        return "\n\n".join(parts)

    def get_instances(self, output_path: str) -> List[Instance]:
        """Load all subsets and return them as test-split instances.

        Args:
            output_path: Scenario output directory; the dataset cache is
                placed in its ``data`` subdirectory, which is created if
                needed.

        Returns:
            One ``Instance`` per dataset row, in ``SUBSETS`` order.

        Raises:
            KeyError: If a row has no ``answer`` field.
        """
        cache_dir = os.path.join(output_path, "data")
        ensure_directory_exists(cache_dir)

        instances: List[Instance] = []
        for subset in self.SUBSETS:
            # NOTE(review): trust_remote_code=True lets the dataset repository
            # execute its own loading code on this machine. Confirm the
            # dataset actually needs a loading script; if it is published as
            # plain data files this flag should be dropped.
            dataset = load_dataset(
                self.HF_DATASET_ID,
                subset,
                split="test",
                cache_dir=cache_dir,
                trust_remote_code=True,
            )
            for row in dataset:
                raw_id = row.get("id")
                reference = Reference(
                    Output(text=str(row["answer"])), tags=[CORRECT_TAG]
                )
                instances.append(
                    Instance(
                        input=Input(text=self._build_input(row)),
                        references=[reference],
                        split=TEST_SPLIT,
                        # Leave the id unset when the row has none, rather
                        # than giving every such instance the same
                        # empty-string (or "None") id.
                        id=None if raw_id is None else str(raw_id),
                    )
                )
        return instances