mathipy 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mathipy-0.1.0/LICENSE +21 -0
- mathipy-0.1.0/PKG-INFO +85 -0
- mathipy-0.1.0/README.md +37 -0
- mathipy-0.1.0/mathipy/__init__.py +19 -0
- mathipy-0.1.0/mathipy/cognitive_load.py +106 -0
- mathipy-0.1.0/mathipy/data/2017-4M1 #4.png +0 -0
- mathipy-0.1.0/mathipy/data/2017-8M3 #2.png +0 -0
- mathipy-0.1.0/mathipy/data/2022-8M1 #2.png +0 -0
- mathipy-0.1.0/mathipy/data/2024-4M10 #2.png +0 -0
- mathipy-0.1.0/mathipy/data/2024-4M13 #2.png +0 -0
- mathipy-0.1.0/mathipy/data/SOURCE.md +9 -0
- mathipy-0.1.0/mathipy/data/__init__.py +26 -0
- mathipy-0.1.0/mathipy/data/naep_sample.csv +6 -0
- mathipy-0.1.0/mathipy/math_content.py +213 -0
- mathipy-0.1.0/mathipy/ocr.py +676 -0
- mathipy-0.1.0/mathipy/py.typed +0 -0
- mathipy-0.1.0/mathipy/readability.py +173 -0
- mathipy-0.1.0/mathipy/utils.py +60 -0
- mathipy-0.1.0/mathipy/visual.py +279 -0
- mathipy-0.1.0/mathipy.egg-info/PKG-INFO +85 -0
- mathipy-0.1.0/mathipy.egg-info/SOURCES.txt +29 -0
- mathipy-0.1.0/mathipy.egg-info/dependency_links.txt +1 -0
- mathipy-0.1.0/mathipy.egg-info/requires.txt +29 -0
- mathipy-0.1.0/mathipy.egg-info/top_level.txt +1 -0
- mathipy-0.1.0/pyproject.toml +87 -0
- mathipy-0.1.0/setup.cfg +4 -0
- mathipy-0.1.0/tests/test_cognitive_load.py +148 -0
- mathipy-0.1.0/tests/test_math_content.py +186 -0
- mathipy-0.1.0/tests/test_ocr.py +409 -0
- mathipy-0.1.0/tests/test_readability.py +152 -0
- mathipy-0.1.0/tests/test_visual.py +242 -0
mathipy-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Mikyung Shin
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
mathipy-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mathipy
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Multimodal item feature extraction for K-12 math assessment
|
|
5
|
+
Author-email: Mikyung Shin <shin.mikyung@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/mshin77/mathipy
|
|
8
|
+
Project-URL: Repository, https://github.com/mshin77/mathipy
|
|
9
|
+
Project-URL: Documentation, https://mshin77.github.io/mathipy
|
|
10
|
+
Project-URL: Issues, https://github.com/mshin77/mathipy/issues
|
|
11
|
+
Keywords: math,assessment,education,readability,k-12
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Intended Audience :: Education
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Education
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering
|
|
22
|
+
Requires-Python: >=3.9
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: numpy>=1.20.0
|
|
26
|
+
Provides-Extra: nlp
|
|
27
|
+
Requires-Dist: textstat>=0.7.0; extra == "nlp"
|
|
28
|
+
Requires-Dist: nltk>=3.8.0; extra == "nlp"
|
|
29
|
+
Provides-Extra: vision
|
|
30
|
+
Requires-Dist: pillow>=9.0.0; extra == "vision"
|
|
31
|
+
Requires-Dist: opencv-python-headless>=4.5.0; extra == "vision"
|
|
32
|
+
Provides-Extra: ocr
|
|
33
|
+
Requires-Dist: httpx>=0.24.0; extra == "ocr"
|
|
34
|
+
Provides-Extra: documents
|
|
35
|
+
Requires-Dist: python-docx>=0.8.0; extra == "documents"
|
|
36
|
+
Requires-Dist: pdfplumber>=0.7.0; extra == "documents"
|
|
37
|
+
Provides-Extra: all
|
|
38
|
+
Requires-Dist: mathipy[documents,nlp,ocr,vision]; extra == "all"
|
|
39
|
+
Provides-Extra: dev
|
|
40
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
41
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
42
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
43
|
+
Provides-Extra: docs
|
|
44
|
+
Requires-Dist: sphinx>=8.2; extra == "docs"
|
|
45
|
+
Requires-Dist: pydata-sphinx-theme>=0.16; extra == "docs"
|
|
46
|
+
Requires-Dist: myst-parser>=3.0; extra == "docs"
|
|
47
|
+
Dynamic: license-file
|
|
48
|
+
|
|
49
|
+
<img src="docs_src/_static/logo.svg" alt="MathiPy Logo" align="right" width="220px"/>
|
|
50
|
+
|
|
51
|
+
[](https://pypi.org/project/mathipy/)
|
|
52
|
+
[](https://pypi.org/project/mathipy/)
|
|
53
|
+
[](https://opensource.org/licenses/MIT)
|
|
54
|
+
|
|
55
|
+
Multimodal item feature extraction for K-12 math assessment. Analyze readability with math-aware normalization via [textstat](https://github.com/textstat/textstat) and [NLTK](https://www.nltk.org/), classify math content by [Common Core State Standards for Mathematics](https://www.thecorestandards.org/Math/) domain, estimate cognitive load components, extract visual complexity features from images using [OpenCV](https://opencv.org/) and [Pillow](https://pillow.readthedocs.io/), and perform multimodal optical character recognition (OCR) through [Gemini](https://ai.google.dev/) and [OpenAI](https://platform.openai.com/) vision APIs.
|
|
56
|
+
|
|
57
|
+
## Installation
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install mathipy
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
With optional dependencies:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install mathipy[nlp] # readability (textstat, nltk)
|
|
67
|
+
pip install mathipy[vision] # visual analysis (opencv, pillow)
|
|
68
|
+
pip install mathipy[ocr] # OCR via vision LLMs (httpx)
|
|
69
|
+
pip install mathipy[documents] # document parsing (python-docx, pdfplumber)
|
|
70
|
+
pip install mathipy[all] # all features
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
From GitHub:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pip install "mathipy[all] @ git+https://github.com/mshin77/mathipy.git"
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Getting Started
|
|
80
|
+
|
|
81
|
+
See [Quick Start](https://mshin77.github.io/mathipy/getting-started.html) and [Analyzing Math Items](https://mshin77.github.io/mathipy/vignettes/naep-demo.html) for tutorials.
|
|
82
|
+
|
|
83
|
+
## Citation
|
|
84
|
+
|
|
85
|
+
- Shin, M. (2026). *MathiPy: Multimodal item feature extraction for K-12 math assessment* (Python package version 0.1.0) [Computer software]. <https://github.com/mshin77/mathipy>
|
mathipy-0.1.0/README.md
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
<img src="docs_src/_static/logo.svg" alt="MathiPy Logo" align="right" width="220px"/>
|
|
2
|
+
|
|
3
|
+
[](https://pypi.org/project/mathipy/)
|
|
4
|
+
[](https://pypi.org/project/mathipy/)
|
|
5
|
+
[](https://opensource.org/licenses/MIT)
|
|
6
|
+
|
|
7
|
+
Multimodal item feature extraction for K-12 math assessment. Analyze readability with math-aware normalization via [textstat](https://github.com/textstat/textstat) and [NLTK](https://www.nltk.org/), classify math content by [Common Core State Standards for Mathematics](https://www.thecorestandards.org/Math/) domain, estimate cognitive load components, extract visual complexity features from images using [OpenCV](https://opencv.org/) and [Pillow](https://pillow.readthedocs.io/), and perform multimodal optical character recognition (OCR) through [Gemini](https://ai.google.dev/) and [OpenAI](https://platform.openai.com/) vision APIs.
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install mathipy
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
With optional dependencies:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install mathipy[nlp] # readability (textstat, nltk)
|
|
19
|
+
pip install mathipy[vision] # visual analysis (opencv, pillow)
|
|
20
|
+
pip install mathipy[ocr] # OCR via vision LLMs (httpx)
|
|
21
|
+
pip install mathipy[documents] # document parsing (python-docx, pdfplumber)
|
|
22
|
+
pip install mathipy[all] # all features
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
From GitHub:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install "mathipy[all] @ git+https://github.com/mshin77/mathipy.git"
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Getting Started
|
|
32
|
+
|
|
33
|
+
See [Quick Start](https://mshin77.github.io/mathipy/getting-started.html) and [Analyzing Math Items](https://mshin77.github.io/mathipy/vignettes/naep-demo.html) for tutorials.
|
|
34
|
+
|
|
35
|
+
## Citation
|
|
36
|
+
|
|
37
|
+
- Shin, M. (2026). *MathiPy: Multimodal item feature extraction for K-12 math assessment* (Python package version 0.1.0) [Computer software]. <https://github.com/mshin77/mathipy>
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""MathiPy - Multimodal item feature extraction for K-12 math assessment."""

# Package metadata.
__version__ = "0.1.0"
__author__ = "Mikyung Shin"
__email__ = "shin.mikyung@gmail.com"

# Re-export the main analyzer classes at the package top level so callers
# can write ``from mathipy import ReadabilityAnalyzer`` etc.
# NOTE(review): these imports are unconditional, yet the package declares
# the nlp/vision/ocr dependencies as optional extras — presumably each
# submodule guards its heavy imports internally; confirm that importing
# ``mathipy`` works on a bare install with no extras.
from mathipy.readability import ReadabilityAnalyzer
from mathipy.math_content import MathContentAnalyzer
from mathipy.cognitive_load import CognitiveLoadEstimator
from mathipy.visual import VisualFeatureExtractor
from mathipy.ocr import MultimodalOCR

# Explicit public API of the package.
__all__ = [
    "ReadabilityAnalyzer",
    "MathContentAnalyzer",
    "CognitiveLoadEstimator",
    "VisualFeatureExtractor",
    "MultimodalOCR",
]
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Cognitive load estimation for mathematical assessment items."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import logging
|
|
5
|
+
from typing import Any, Dict, List, Optional
|
|
6
|
+
|
|
7
|
+
from mathipy.utils import extract_numbers, extract_variables
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class CognitiveLoadEstimator:
    """Estimate cognitive load components for math assessment items.

    Computes intrinsic (item complexity), extraneous (linguistic demand),
    and germane (schema-building) load from text features. All three
    components are proportions in ``[0, 1]``; the total is their weighted
    sum (0.4 intrinsic, 0.3 extraneous, 0.3 germane).
    """

    def estimate(
        self,
        text: str,
        readability_grade: Optional[float] = None,
        math_terms: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """Estimate cognitive load for the given text.

        Args:
            text: Input text to analyze.
            readability_grade: Optional Flesch-Kincaid grade level. Estimated from text if not provided.
            math_terms: Optional list of math terms found in the text. Estimated from keywords if not provided.
                Note that an explicitly empty list also falls back to the
                keyword heuristic.

        Returns:
            Dictionary with ``intrinsic_cognitive_load``, ``extraneous_cognitive_load``,
            ``germane_cognitive_load``, ``total_cognitive_load``, and element counts.
        """
        if not text or not text.strip():
            return self._empty_estimate()

        numbers = extract_numbers(text)
        variables = extract_variables(text)
        # Counts every occurrence of these characters, so hyphens in prose
        # (e.g. "well-known") are included in the operation count too.
        operations = sum(1 for c in text if c in "+-*/^=<>")
        word_count = len(text.split())

        # Intrinsic load: density of math elements per word, doubled so a
        # moderately dense item approaches 1.0, then capped.
        intrinsic = (len(numbers) + len(variables)) / word_count if word_count else 0
        intrinsic = min(1.0, intrinsic * 2)

        if readability_grade is not None:
            # Flesch-Kincaid grade can be negative for very simple text, so
            # clamp to [0, 1] to keep the component a valid proportion.
            extraneous = max(0.0, min(1.0, readability_grade / 12))
        else:
            extraneous = self._estimate_extraneous(text)

        # Germane load: scaled count of math terms (10+ terms saturates at 1.0).
        math_term_count = len(math_terms) if math_terms else 0
        if math_term_count:
            germane = min(1.0, math_term_count / 10)
        else:
            germane = self._estimate_germane(text)

        # Weighted combination of the three components.
        total = intrinsic * 0.4 + extraneous * 0.3 + germane * 0.3

        return {
            "intrinsic_cognitive_load": round(intrinsic, 3),
            "extraneous_cognitive_load": round(extraneous, 3),
            "germane_cognitive_load": round(germane, 3),
            "total_cognitive_load": round(total, 3),
            "numeric_elements": len(numbers),
            "variable_count": len(variables),
            "operation_count": operations,
        }

    def _estimate_extraneous(self, text: str) -> float:
        """Roughly estimate linguistic demand from word/sentence lengths.

        Uses a crude grade-level proxy (no syllable counting), clamped to
        grades 1-16 and normalized by grade 12.
        """
        words = text.split()
        word_count = len(words)
        if not word_count:
            return 0.0

        avg_word_length = sum(len(w) for w in words) / word_count
        sentences = re.split(r"[.!?]+", text)
        sentences = [s for s in sentences if s.strip()]
        avg_sentence_length = word_count / max(len(sentences), 1)

        # Ad-hoc linear proxy for a readability grade level.
        estimated_grade = (avg_word_length * 1.5) + (avg_sentence_length * 0.3) - 3
        estimated_grade = max(1, min(16, estimated_grade))
        return min(1.0, estimated_grade / 12)

    def _estimate_germane(self, text: str) -> float:
        """Estimate schema-building load from math keyword occurrences.

        Returns a 0.3 baseline when no keyword is found, otherwise the
        keyword count scaled by 10 and capped at 1.0. Matching is by
        substring, so e.g. "sum" also matches inside "summary".
        """
        math_keywords = {
            "add", "subtract", "multiply", "divide", "sum", "difference",
            "product", "quotient", "fraction", "decimal", "percent",
            "equation", "variable", "solve", "function", "graph",
            "area", "perimeter", "volume", "angle", "triangle", "circle",
            "mean", "median", "mode", "probability", "ratio", "proportion",
        }
        text_lower = text.lower()
        found = sum(1 for term in math_keywords if term in text_lower)
        return min(1.0, found / 10) if found else 0.3

    def _empty_estimate(self) -> Dict[str, Any]:
        """Return the all-zero estimate used for empty/blank input."""
        return {
            "intrinsic_cognitive_load": 0.0,
            "extraneous_cognitive_load": 0.0,
            "germane_cognitive_load": 0.0,
            "total_cognitive_load": 0.0,
            "numeric_elements": 0,
            "variable_count": 0,
            "operation_count": 0,
        }
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
SOURCE: U.S. Department of Education, Institute of Education Sciences,
|
|
2
|
+
National Center for Education Statistics, National Assessment of Educational
|
|
3
|
+
Progress (NAEP), 2017, 2022, and 2024 Mathematics Assessments.
|
|
4
|
+
|
|
5
|
+
Items obtained from the NAEP Questions Tool (https://www.nationsreportcard.gov/nqt/).
|
|
6
|
+
|
|
7
|
+
NAEP released items are in the public domain per the NAEP Questions Tool
|
|
8
|
+
Copyright Policy. This sample dataset contains 5 items selected to
|
|
9
|
+
demonstrate MathiPy's feature extraction capabilities.
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Sample NAEP assessment items for demonstrating MathiPy features."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
# Directory containing the bundled NAEP sample data (CSV + item images).
DATA_DIR = Path(__file__).parent


def get_sample_csv() -> Path:
    """Return the path to the sample NAEP CSV file."""
    return DATA_DIR / "naep_sample.csv"


def get_sample_image(item_id: str) -> Path:
    """Return the path to a sample NAEP item image by item ID (e.g., ``"2024-4M10 #2"``).

    Args:
        item_id: Item identifier; only letters, digits, underscore, hyphen,
            space, and ``#`` are allowed.

    Returns:
        Path to ``<item_id>.png`` inside the package data directory.

    Raises:
        ValueError: If ``item_id`` contains disallowed characters or would
            resolve outside the data directory.
    """
    # fullmatch with an explicit space (instead of \s) rejects tabs and
    # newlines that the previous ``re.match(r'^...$')`` check allowed:
    # ``\s`` matched any whitespace and ``$`` matches before a trailing
    # newline, so ids like "abc\n" slipped through into filenames.
    if not re.fullmatch(r"[a-zA-Z0-9_\- #]+", item_id):
        raise ValueError(f"Invalid item_id: {item_id}")
    path = DATA_DIR / f"{item_id}.png"
    # Defense in depth: ensure the resolved path cannot escape the data dir.
    if not path.resolve().is_relative_to(DATA_DIR.resolve()):
        raise ValueError(f"Invalid item_id: {item_id}")
    return path


def list_sample_images() -> list:
    """Return a sorted list of available sample image filenames."""
    return sorted(p.name for p in DATA_DIR.glob("*.png"))
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
item_id,grade,year,difficulty,content,image_file
|
|
2
|
+
2024-4M10 #2,4,2024,Easy,Algebra,2024-4M10 #2.png
|
|
3
|
+
2017-4M1 #4,4,2017,Medium,Number Properties and Operations,2017-4M1 #4.png
|
|
4
|
+
2024-4M13 #2,4,2024,Hard,Measurement,2024-4M13 #2.png
|
|
5
|
+
2022-8M1 #2,8,2022,Easy,Geometry,2022-8M1 #2.png
|
|
6
|
+
2017-8M3 #2,8,2017,Easy,"Data Analysis, Statistics, and Probability",2017-8M3 #2.png
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
"""Mathematical content analysis and domain classification."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import logging
|
|
5
|
+
from collections import Counter
|
|
6
|
+
from typing import Any, Dict, List, Set, Union
|
|
7
|
+
|
|
8
|
+
from mathipy.utils import extract_numbers
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class MathContentAnalyzer:
    """Analyze math content and classify by Common Core State Standards domain.

    Detects math patterns (equations, fractions, operations), counts symbols,
    extracts numbers and vocabulary, and classifies the primary math domain
    (arithmetic, algebra, geometry, statistics, calculus, fractions).
    """

    def __init__(self):
        self._init_patterns()
        self._init_vocabulary()

    def _init_patterns(self):
        """Compile the regex patterns and symbol table used by ``analyze``."""
        self.patterns = {
            "addition": re.compile(r"\d+\s*\+\s*\d+"),
            "subtraction": re.compile(r"\d+\s*[-−]\s*\d+"),
            "multiplication": re.compile(r"\d+\s*[×*·]\s*\d+"),
            "division": re.compile(r"\d+\s*[÷/]\s*\d+"),
            "variable": re.compile(r"\b[a-zA-Z]\b(?!\w)"),
            "equation": re.compile(r"[^=]+=\s*[^=]+"),
            "inequality": re.compile(r"[^<>=]+\s*[<>≤≥]\s*[^<>=]+"),
            "exponent": re.compile(r"\w+\^[\w\d{}]+|\w+\*\*[\w\d{}]+"),
            "function": re.compile(r"\b[a-zA-Z]+\([^)]+\)"),
            "polynomial": re.compile(r"[a-z]\^?\d*\s*[+\-]\s*[a-z]\^?\d*"),
            "fraction": re.compile(r"\d+/\d+|\\frac\{\d+\}\{\d+\}"),
            "decimal": re.compile(r"\d+\.\d+"),
            "percentage": re.compile(r"\d+\.?\d*%"),
            "ratio": re.compile(r"\d+:\d+"),
            "scientific_notation": re.compile(r"\d+\.?\d*\s*[×x]\s*10\^[-]?\d+"),
            "derivative": re.compile(r"d/dx|f'|\\frac\{d\}\{dx\}"),
            "integral": re.compile(r"∫|\\int"),
            "limit": re.compile(r"\\lim|lim_"),
            "summation": re.compile(r"∑|\\sum"),
        }

        # Single-character math symbols mapped to their semantic name.
        self.symbols = {
            "+": "addition", "-": "subtraction", "×": "multiplication",
            "*": "multiplication", "·": "multiplication", "÷": "division",
            "/": "division", "=": "equals", "<": "less_than",
            ">": "greater_than", "≤": "less_equal", "≥": "greater_equal",
            "≠": "not_equal", "≈": "approximately", "√": "square_root",
            "∑": "summation", "∫": "integral", "π": "pi", "∞": "infinity",
        }

    def _init_vocabulary(self):
        """Build the per-domain vocabulary sets and the union of all terms."""
        # Insertion order matters: it is the tie-break order for ``max`` in
        # ``_classify_domain`` and for the stable sort behind ``secondary``.
        self.domains = {
            "arithmetic": {
                "add", "subtract", "multiply", "divide", "sum", "difference",
                "product", "quotient", "remainder", "factor", "multiple",
                "even", "odd", "prime", "composite", "digit", "place value",
            },
            "algebra": {
                "variable", "coefficient", "term", "expression", "equation",
                "inequality", "solve", "simplify", "factor", "polynomial",
                "linear", "quadratic", "function", "slope", "intercept",
            },
            "geometry": {
                "point", "line", "ray", "segment", "angle", "triangle",
                "rectangle", "square", "circle", "polygon", "area",
                "perimeter", "volume", "parallel", "perpendicular", "congruent",
            },
            "statistics": {
                "mean", "median", "mode", "range", "data", "graph", "chart",
                "probability", "outcome", "sample", "population", "distribution",
                "standard deviation", "variance", "correlation",
            },
            "calculus": {
                "limit", "derivative", "integral", "differentiate", "integrate",
                "continuous", "rate of change", "maximum", "minimum",
                "optimization", "series", "convergence",
            },
            "fractions": {
                "fraction", "numerator", "denominator", "mixed number",
                "improper", "equivalent", "simplify", "common denominator",
                "decimal", "percent", "ratio", "proportion",
            },
        }

        self.all_terms: Set[str] = set()
        for terms in self.domains.values():
            self.all_terms.update(terms)

    def analyze(self, text: str) -> Dict[str, Any]:
        """Analyze math content in the given text.

        Args:
            text: Input text to analyze.

        Returns:
            Dictionary with ``pattern_matches``, ``symbol_counts``, ``numbers``,
            ``vocabulary``, ``domain_classification``, and ``math_density``.
        """
        if not text or not text.strip():
            return self._empty_analysis()

        text_lower = text.lower()
        pattern_matches = self._match_patterns(text)
        symbol_counts = self._count_symbols(text)
        numbers = extract_numbers(text)
        term_matches = self._match_vocabulary(text_lower)
        domain = self._classify_domain(text_lower, pattern_matches, term_matches)

        word_count = len(text.split())
        return {
            "pattern_matches": pattern_matches,
            "symbol_counts": symbol_counts,
            "total_math_symbols": sum(symbol_counts.values()),
            "unique_symbol_types": len(symbol_counts),
            "numbers": {
                "count": len(numbers),
                # Cap the echoed values to keep the result compact.
                "values": numbers[:20],
                "range": max(numbers) - min(numbers) if numbers else 0,
                "has_negative": any(n < 0 for n in numbers),
                "has_decimal": any(isinstance(n, float) and n != int(n) for n in numbers),
            },
            "vocabulary": {
                "math_terms": list(term_matches.keys()),
                "term_count": sum(term_matches.values()),
                "unique_terms": len(term_matches),
            },
            "domain_classification": domain,
            "math_density": sum(pattern_matches.values()) / word_count if word_count else 0,
        }

    def _match_patterns(self, text: str) -> Dict[str, int]:
        """Return non-zero match counts for each compiled math pattern."""
        matches = {}
        for name, pattern in self.patterns.items():
            found = pattern.findall(text)
            if found:
                matches[name] = len(found)
        return matches

    def _count_symbols(self, text: str) -> Dict[str, int]:
        """Count occurrences of known math symbols, keyed by semantic name."""
        counts = Counter()
        for char in text:
            if char in self.symbols:
                counts[self.symbols[char]] += 1
        return dict(counts)

    def _match_vocabulary(self, text: str) -> Dict[str, int]:
        """Count whole-word occurrences of known math terms.

        Uses word-boundary matching so that e.g. "add" does not match inside
        "address", "line" inside "linear", or "mean" inside "meaning", which
        plain substring counting did.
        """
        matches = {}
        for term in self.all_terms:
            count = len(re.findall(r"\b" + re.escape(term) + r"\b", text))
            if count > 0:
                matches[term] = count
        return matches

    def _classify_domain(
        self,
        text: str,
        patterns: Dict[str, int],
        terms: Dict[str, int],
    ) -> Dict[str, Any]:
        """Score each domain from vocabulary and pattern evidence.

        Returns the best-scoring domain as ``primary`` (``"unknown"`` when
        there is no evidence at all), a normalized ``confidence``, the raw
        ``scores``, and up to two runner-up ``secondary`` domains.
        """
        domain_scores: Dict[str, float] = {}

        for domain, vocab in self.domains.items():
            score = 0
            for term in vocab:
                if term in terms:
                    score += terms[term]
            domain_scores[domain] = score

        # Pattern-based bonuses: strong structural evidence for a domain.
        if patterns.get("derivative") or patterns.get("integral"):
            domain_scores["calculus"] = domain_scores.get("calculus", 0) + 5

        if patterns.get("fraction"):
            domain_scores["fractions"] = domain_scores.get("fractions", 0) + 3

        if patterns.get("equation") or patterns.get("variable"):
            domain_scores["algebra"] = domain_scores.get("algebra", 0) + 2

        # With zero evidence everywhere, report "unknown" rather than
        # arbitrarily picking the first domain in insertion order.
        if domain_scores and max(domain_scores.values()) > 0:
            primary = max(domain_scores, key=domain_scores.get)
        else:
            primary = "unknown"
        total = sum(domain_scores.values()) or 1

        return {
            "primary": primary,
            "confidence": domain_scores.get(primary, 0) / total,
            "scores": domain_scores,
            "secondary": sorted(
                domain_scores.keys(),
                key=lambda k: domain_scores[k],
                reverse=True,
            )[1:3] if len(domain_scores) > 1 else [],
        }

    def _empty_analysis(self) -> Dict[str, Any]:
        """Return the zeroed analysis used for empty/blank input."""
        return {
            "pattern_matches": {},
            "symbol_counts": {},
            "total_math_symbols": 0,
            "unique_symbol_types": 0,
            "numbers": {
                "count": 0, "values": [], "range": 0,
                "has_negative": False, "has_decimal": False,
            },
            "vocabulary": {"math_terms": [], "term_count": 0, "unique_terms": 0},
            "domain_classification": {
                "primary": "unknown", "confidence": 0, "scores": {}, "secondary": [],
            },
            "math_density": 0,
        }
|