crossrs 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crossrs-0.1.0/LICENSE +21 -0
- crossrs-0.1.0/PKG-INFO +90 -0
- crossrs-0.1.0/README.md +66 -0
- crossrs-0.1.0/pyproject.toml +6 -0
- crossrs-0.1.0/setup.cfg +42 -0
- crossrs-0.1.0/src/crossrs/__init__.py +1 -0
- crossrs-0.1.0/src/crossrs/app/__init__.py +6 -0
- crossrs-0.1.0/src/crossrs/app/app.py +7 -0
- crossrs-0.1.0/src/crossrs/app/commands/__init__.py +13 -0
- crossrs-0.1.0/src/crossrs/app/commands/delete.py +40 -0
- crossrs-0.1.0/src/crossrs/app/commands/init.py +118 -0
- crossrs-0.1.0/src/crossrs/app/commands/path.py +26 -0
- crossrs-0.1.0/src/crossrs/app/commands/stats.py +138 -0
- crossrs-0.1.0/src/crossrs/app/commands/study/__init__.py +300 -0
- crossrs-0.1.0/src/crossrs/app/commands/study/chooser.py +165 -0
- crossrs-0.1.0/src/crossrs/app/commands/study/evaluator.py +162 -0
- crossrs-0.1.0/src/crossrs/app/commands/study/explainer.py +43 -0
- crossrs-0.1.0/src/crossrs/app/commands/study/interaction.py +80 -0
- crossrs-0.1.0/src/crossrs/app/commands/study/updater.py +103 -0
- crossrs-0.1.0/src/crossrs/crossrs.py +4 -0
- crossrs-0.1.0/src/crossrs/db/__init__.py +51 -0
- crossrs-0.1.0/src/crossrs/db/base.py +9 -0
- crossrs-0.1.0/src/crossrs/db/models.py +113 -0
- crossrs-0.1.0/src/crossrs/diff/__init__.py +40 -0
- crossrs-0.1.0/src/crossrs/diff/tokenizer.py +41 -0
- crossrs-0.1.0/src/crossrs/utils/__init__.py +1 -0
- crossrs-0.1.0/src/crossrs/utils/console.py +12 -0
- crossrs-0.1.0/src/crossrs/utils/paths.py +31 -0
- crossrs-0.1.0/src/crossrs/utils/strings.py +9 -0
- crossrs-0.1.0/src/crossrs/utils/time.py +9 -0
- crossrs-0.1.0/src/crossrs/utils/typer.py +13 -0
- crossrs-0.1.0/src/crossrs.egg-info/PKG-INFO +90 -0
- crossrs-0.1.0/src/crossrs.egg-info/SOURCES.txt +37 -0
- crossrs-0.1.0/src/crossrs.egg-info/dependency_links.txt +1 -0
- crossrs-0.1.0/src/crossrs.egg-info/entry_points.txt +2 -0
- crossrs-0.1.0/src/crossrs.egg-info/requires.txt +8 -0
- crossrs-0.1.0/src/crossrs.egg-info/top_level.txt +1 -0
- crossrs-0.1.0/tests/test_crossrs.py +411 -0
crossrs-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Danylo Mysak
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
crossrs-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: crossrs
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Command-line tool for learning foreign languages through reverse translation of word-based sentences
|
|
5
|
+
Home-page: https://github.com/danmysak/crossrs
|
|
6
|
+
Author: Danylo Mysak
|
|
7
|
+
Author-email: danmysak@gmail.com
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/danmysak/crossrs/issues
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Requires-Python: >=3.13
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
Requires-Dist: openai
|
|
16
|
+
Requires-Dist: prompt_toolkit
|
|
17
|
+
Requires-Dist: pydantic
|
|
18
|
+
Requires-Dist: regex
|
|
19
|
+
Requires-Dist: rich
|
|
20
|
+
Requires-Dist: SQLAlchemy
|
|
21
|
+
Requires-Dist: tqdm
|
|
22
|
+
Requires-Dist: typer
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
|
|
25
|
+
# CrossRS
|
|
26
|
+
|
|
27
|
+
CrossRS is a command-line tool for improving language **production** skills through reverse translation exercises. Given a corpus in your target language, CrossRS translates sentences into a source language you already know and asks you to translate them back, reinforcing vocabulary and grammar through word-based spaced repetition. Because CrossRS uses GPT under the hood, you must have a paid [OpenAI account](https://platform.openai.com/) and an [API key](https://platform.openai.com/api-keys) to run it.
|
|
28
|
+
|
|
29
|
+
## How It Works
|
|
30
|
+
|
|
31
|
+
CrossRS focuses on **words** sorted by their frequency in the corpus. You learn the most common ones first. Each study round:
|
|
32
|
+
|
|
33
|
+
1. CrossRS picks a sentence containing the next word to learn.
|
|
34
|
+
2. The sentence is translated into your source language and shown to you.
|
|
35
|
+
3. You translate it back into the target language.
|
|
36
|
+
4. CrossRS evaluates your translation and provides feedback — either a ✅ confirmation or a ❌ with a highlighted diff showing the minimal corrections needed.
|
|
37
|
+
|
|
38
|
+
Sentences you translate correctly on the first try are scheduled for a single review in **29 days 20 hours**. Otherwise, they enter a spaced-repetition queue with reviews at **20 hours**, **6 days 20 hours**, and **29 days 20 hours** before being marked as learned.
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
|
|
42
|
+
Install [Python](https://www.python.org/downloads/) **3.13 or later** and [pipx](https://pipx.pypa.io/stable/installation/), then run:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pipx install crossrs # install
|
|
46
|
+
pipx upgrade crossrs # upgrade
|
|
47
|
+
pipx uninstall crossrs # uninstall
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Initialize a New Language
|
|
51
|
+
|
|
52
|
+
Prepare a plain-text file that contains **one sentence per line** in the language you want to learn. For example, you can download a monolingual corpus from [OPUS](https://opus.nlpl.eu/). Then run:
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
crossrs init <target-lang> <corpus>
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
`<target-lang>` is a language code (e.g., `de`, `fr`, `uk`), and `<corpus>` is the path to the corpus file.
|
|
59
|
+
|
|
60
|
+
## Study a Language
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
crossrs study <target-lang> <source-lang> [--threshold T] [--model <GPT_MODEL>] [--api-key <OPENAI_KEY>]
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
* **`<target-lang>`** — the language code you initialized earlier.
|
|
67
|
+
* **`<source-lang>`** — the language you want sentences translated into (e.g., `en`).
|
|
68
|
+
* **`--threshold` / `-t`** — the learnedness threshold for words (default: 3). A word is considered fully learned once it has appeared in this many learned sentences.
|
|
69
|
+
* **`--model`** — the GPT model to use for translation and evaluation.
|
|
70
|
+
* **`--api-key`** — your OpenAI API key.
|
|
71
|
+
|
|
72
|
+
Instead of passing `--model` and `--api-key` each time, you can set the environment variables `CROSSRS_MODEL` and `CROSSRS_API_KEY`.
|
|
73
|
+
|
|
74
|
+
## View Your Progress
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
crossrs stats <target-lang> [--threshold T]
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Displays:
|
|
81
|
+
- **Sentences**: learned + in queue / total
|
|
82
|
+
- **Words**: learned / total, with word-level coverage
|
|
83
|
+
- **Total rounds**: the number of translation attempts so far
|
|
84
|
+
|
|
85
|
+
## Other Commands
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
crossrs path <target-lang> # show the path to the language data file
|
|
89
|
+
crossrs delete <target-lang> [--force] # delete the language data file; use --force to skip the confirmation prompt
|
|
90
|
+
```
|
crossrs-0.1.0/README.md
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# CrossRS
|
|
2
|
+
|
|
3
|
+
CrossRS is a command-line tool for improving language **production** skills through reverse translation exercises. Given a corpus in your target language, CrossRS translates sentences into a source language you already know and asks you to translate them back, reinforcing vocabulary and grammar through word-based spaced repetition. Because CrossRS uses GPT under the hood, you must have a paid [OpenAI account](https://platform.openai.com/) and an [API key](https://platform.openai.com/api-keys) to run it.
|
|
4
|
+
|
|
5
|
+
## How It Works
|
|
6
|
+
|
|
7
|
+
CrossRS focuses on **words** sorted by their frequency in the corpus. You learn the most common ones first. Each study round:
|
|
8
|
+
|
|
9
|
+
1. CrossRS picks a sentence containing the next word to learn.
|
|
10
|
+
2. The sentence is translated into your source language and shown to you.
|
|
11
|
+
3. You translate it back into the target language.
|
|
12
|
+
4. CrossRS evaluates your translation and provides feedback — either a ✅ confirmation or a ❌ with a highlighted diff showing the minimal corrections needed.
|
|
13
|
+
|
|
14
|
+
Sentences you translate correctly on the first try are scheduled for a single review in **29 days 20 hours**. Otherwise, they enter a spaced-repetition queue with reviews at **20 hours**, **6 days 20 hours**, and **29 days 20 hours** before being marked as learned.
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
Install [Python](https://www.python.org/downloads/) **3.13 or later** and [pipx](https://pipx.pypa.io/stable/installation/), then run:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pipx install crossrs # install
|
|
22
|
+
pipx upgrade crossrs # upgrade
|
|
23
|
+
pipx uninstall crossrs # uninstall
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Initialize a New Language
|
|
27
|
+
|
|
28
|
+
Prepare a plain-text file that contains **one sentence per line** in the language you want to learn. For example, you can download a monolingual corpus from [OPUS](https://opus.nlpl.eu/). Then run:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
crossrs init <target-lang> <corpus>
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
`<target-lang>` is a language code (e.g., `de`, `fr`, `uk`), and `<corpus>` is the path to the corpus file.
|
|
35
|
+
|
|
36
|
+
## Study a Language
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
crossrs study <target-lang> <source-lang> [--threshold T] [--model <GPT_MODEL>] [--api-key <OPENAI_KEY>]
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
* **`<target-lang>`** — the language code you initialized earlier.
|
|
43
|
+
* **`<source-lang>`** — the language you want sentences translated into (e.g., `en`).
|
|
44
|
+
* **`--threshold` / `-t`** — the learnedness threshold for words (default: 3). A word is considered fully learned once it has appeared in this many learned sentences.
|
|
45
|
+
* **`--model`** — the GPT model to use for translation and evaluation.
|
|
46
|
+
* **`--api-key`** — your OpenAI API key.
|
|
47
|
+
|
|
48
|
+
Instead of passing `--model` and `--api-key` each time, you can set the environment variables `CROSSRS_MODEL` and `CROSSRS_API_KEY`.
|
|
49
|
+
|
|
50
|
+
## View Your Progress
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
crossrs stats <target-lang> [--threshold T]
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Displays:
|
|
57
|
+
- **Sentences**: learned + in queue / total
|
|
58
|
+
- **Words**: learned / total, with word-level coverage
|
|
59
|
+
- **Total rounds**: the number of translation attempts so far
|
|
60
|
+
|
|
61
|
+
## Other Commands
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
crossrs path <target-lang> # show the path to the language data file
|
|
65
|
+
crossrs delete <target-lang> [--force] # delete the language data file; use --force to skip the confirmation prompt
|
|
66
|
+
```
|
crossrs-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
[metadata]
|
|
2
|
+
name = crossrs
|
|
3
|
+
version = 0.1.0
|
|
4
|
+
author = Danylo Mysak
|
|
5
|
+
author_email = danmysak@gmail.com
|
|
6
|
+
description = Command-line tool for learning foreign languages through reverse translation of word-based sentences
|
|
7
|
+
long_description = file: README.md
|
|
8
|
+
long_description_content_type = text/markdown
|
|
9
|
+
url = https://github.com/danmysak/crossrs
|
|
10
|
+
project_urls =
|
|
11
|
+
Bug Tracker = https://github.com/danmysak/crossrs/issues
|
|
12
|
+
classifiers =
|
|
13
|
+
Programming Language :: Python :: 3
|
|
14
|
+
License :: OSI Approved :: MIT License
|
|
15
|
+
Operating System :: OS Independent
|
|
16
|
+
|
|
17
|
+
[options]
|
|
18
|
+
package_dir =
|
|
19
|
+
=src
|
|
20
|
+
packages = find:
|
|
21
|
+
python_requires = >=3.13
|
|
22
|
+
install_requires =
|
|
23
|
+
openai
|
|
24
|
+
prompt_toolkit
|
|
25
|
+
pydantic
|
|
26
|
+
regex
|
|
27
|
+
rich
|
|
28
|
+
SQLAlchemy
|
|
29
|
+
tqdm
|
|
30
|
+
typer
|
|
31
|
+
|
|
32
|
+
[options.entry_points]
|
|
33
|
+
console_scripts =
|
|
34
|
+
crossrs = crossrs.crossrs:app
|
|
35
|
+
|
|
36
|
+
[options.packages.find]
|
|
37
|
+
where = src
|
|
38
|
+
|
|
39
|
+
[egg_info]
|
|
40
|
+
tag_build =
|
|
41
|
+
tag_date = 0
|
|
42
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from typing import Annotated
|
|
2
|
+
|
|
3
|
+
from typer import Argument, Option
|
|
4
|
+
|
|
5
|
+
from crossrs.app.app import app
|
|
6
|
+
from crossrs.db import get_path
|
|
7
|
+
from crossrs.utils.typer import typer_raise
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
'delete',
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@app.command()
def delete(
    language: Annotated[
        str,
        Argument(help='Target language code whose data should be removed.'),
    ],
    force: Annotated[
        bool | None,
        Option(
            '--force',
            '-f',
            help='Skip the confirmation prompt and delete immediately.',
        ),
    ] = False,
) -> None:
    """Delete all stored data for `language`."""
    path = get_path(language)
    if not path.exists():
        typer_raise(f'Language "{language}" is not initialized.')

    # Ask for confirmation unless the user explicitly opted out with --force.
    if not force:
        answer = input(
            f'Are you sure you want to delete all data for "{language}"? (y/N): ',
        )
        if not answer.lower().startswith('y'):
            typer_raise('Operation canceled.')

    path.unlink(missing_ok=True)
    print(f'Deleted all data for "{language}".')
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from collections import Counter
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Annotated, Generator
|
|
5
|
+
|
|
6
|
+
from sqlalchemy import text
|
|
7
|
+
from tqdm import tqdm
|
|
8
|
+
from typer import Argument
|
|
9
|
+
|
|
10
|
+
from crossrs.app.app import app
|
|
11
|
+
from crossrs.db import get_session, Session
|
|
12
|
+
from crossrs.db.models import Metadata, Word, Sentence, SentenceWord
|
|
13
|
+
from crossrs.diff.tokenizer import tokenize
|
|
14
|
+
from crossrs.utils.typer import typer_raise
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
'init',
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def get_sentences(corpus: Path) -> Generator[str, None, None]:
    """Lazily yield each non-blank line of the corpus file, stripped of whitespace.

    Each line of the file is treated as one sentence; blank (or
    whitespace-only) lines are skipped.
    """
    with corpus.open('r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                yield stripped
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def extract_tokens(sentence: str) -> list[str]:
    """Tokenize `sentence` and return the normalized form of each token."""
    normalized: list[str] = []
    for token in tokenize(sentence):
        normalized.append(token.normalized)
    return normalized
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def extract_words(tokens: list[str]) -> list[str]:
    """Return a fresh list containing the words derived from `tokens`.

    Every token currently counts as a word; a new list is returned so
    callers can mutate the result without affecting the input.
    """
    return [*tokens]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def process_corpus(corpus: Path) -> tuple[dict[str, set[str]], Counter[str]]:
    """Scan the corpus once, collecting per-sentence word sets and word frequencies.

    Duplicate sentences are processed only the first time they appear, and
    sentences that tokenize to nothing are dropped entirely.
    """
    words_by_sentence: dict[str, set[str]] = {}
    word_frequencies: Counter[str] = Counter()

    for sentence in tqdm(get_sentences(corpus), desc='Processing sentences'):
        if sentence in words_by_sentence:
            continue  # already counted this exact sentence
        words = extract_words(extract_tokens(sentence))
        if not words:
            continue  # nothing tokenizable in this line
        words_by_sentence[sentence] = set(words)
        word_frequencies.update(words)

    return words_by_sentence, word_frequencies
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def add_words(session: Session, frequencies: Counter[str]) -> dict[str, int]:
    """Insert words in descending frequency order and return word -> database ID.

    Flushing after each insert makes SQLAlchemy assign the row's primary key,
    so the ID can be recorded in the mapping immediately.
    """
    ids_by_word: dict[str, int] = {}
    for word, count in tqdm(frequencies.most_common(), desc='Adding words'):
        record = Word(word=word, occurrences=count, learnedness=0)
        session.add(record)
        session.flush()
        ids_by_word[word] = record.id
    return ids_by_word
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def add_sentences(session: Session, words_by_sentence: dict[str, set[str]],
                  ids_by_word: dict[str, int]) -> None:
    """Add sentences to the database with their word associations.

    Args:
        session: Open database session; rows are flushed but not committed here.
        words_by_sentence: Mapping of sentence text to its set of unique words.
        ids_by_word: Mapping of word text to its database ID (as produced
            by `add_words`).
    """
    for sentence_text, words in tqdm(words_by_sentence.items(), desc='Adding sentences'):
        sentence_obj = Sentence(sentence=sentence_text)
        session.add(sentence_obj)
        # Flush so the sentence row gets its primary key before linking words.
        session.flush()
        for word in words:
            session.add(SentenceWord(
                sentence_id=sentence_obj.id,
                word_id=ids_by_word[word],
            ))
        # Push this sentence's word associations to the database before moving on.
        session.flush()
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@app.command()
def init(
    language: Annotated[
        str,
        Argument(help='Target language code (e.g., "de", "fr", "uk").'),
    ],
    corpus: Annotated[
        Path,
        Argument(
            dir_okay=False,
            exists=True,
            readable=True,
            resolve_path=True,
            help='Plain-text file containing one sentence per line.',
        ),
    ],
) -> None:
    """Initialize CrossRS for a new target language."""
    with get_session(language) as session:
        # Refuse to re-initialize: any existing sentence means data is present.
        if session.query(Sentence).limit(1).first() is not None:
            typer_raise(f'CrossRS is already initialized for language "{language}".')
        words_by_sentence, word_frequencies = process_corpus(corpus)
        if not word_frequencies:
            typer_raise('No valid sentences found in the corpus.')
        add_sentences(session, words_by_sentence,
                      add_words(session, word_frequencies))
        session.add(Metadata(id=1, total_rounds=0))
        print('Committing changes to the database...')
        session.commit()
        print('Optimizing the database...')
        session.execute(text('vacuum'))
        print(f'Initialized CrossRS for language "{language}" '
              f'with {len(words_by_sentence)} unique sentences '
              f'and {len(word_frequencies)} unique words.')
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from typing import Annotated
|
|
2
|
+
|
|
3
|
+
from typer import Argument
|
|
4
|
+
|
|
5
|
+
from crossrs.app.app import app
|
|
6
|
+
from crossrs.db import get_path
|
|
7
|
+
from crossrs.utils.typer import typer_raise
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
'path',
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@app.command()
def path(
    language: Annotated[
        str,
        Argument(help='Target language code whose data file should be printed.'),
    ],
) -> None:
    """Print the absolute path to CrossRS's data file for `language`."""
    language_path = get_path(language)
    # typer_raise aborts the command, so the guard clause replaces the else.
    if not language_path.exists():
        typer_raise(f'Language "{language}" is not initialized.')
    print(language_path.absolute())
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Annotated
|
|
3
|
+
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from rich.console import Console
|
|
6
|
+
from rich.text import Text
|
|
7
|
+
from sqlalchemy import func, case
|
|
8
|
+
from typer import Argument, Option
|
|
9
|
+
|
|
10
|
+
from crossrs.app.app import app
|
|
11
|
+
from crossrs.db import get_session, Session
|
|
12
|
+
from crossrs.db.models import Metadata, Word, Sentence
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
'stats',
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
DEFAULT_THRESHOLD = 3
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class WordStatsItem:
    """Aggregated word-level study statistics for one language."""
    # Distinct words whose learnedness meets the threshold.
    learned: int = 0
    # Total distinct words in the corpus.
    total: int = 0
    # Corpus occurrences accounted for by learned words (drives coverage %).
    learned_occurrences: int = 0
    # Total corpus occurrences across all words.
    total_occurrences: int = 0
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class SentenceStatsData:
    """Aggregated sentence-level study statistics for one language."""
    # Sentences with status == 2 (fully learned).
    learned: int = 0
    # Sentences with status == 1 (in the review queue).
    in_queue: int = 0
    # All sentences in the database.
    total: int = 0
    # Lifetime number of translation rounds (from the Metadata row).
    total_rounds: int = 0
    # Distinct not-yet-learned words targeted by queued sentences.
    targeted_words: int = 0
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def compute_word_stats(session: Session, threshold: int) -> WordStatsItem:
    """Aggregate word counts and occurrence totals in a single SQL query.

    A word counts as learned once its learnedness reaches `threshold`.
    """
    learned_flag = case((Word.learnedness >= threshold, 1), else_=0)
    learned_occ = case((Word.learnedness >= threshold, Word.occurrences), else_=0)
    total, total_occurrences, learned, learned_occurrences = session.query(
        func.count(Word.id),
        func.sum(Word.occurrences),
        func.sum(learned_flag),
        func.sum(learned_occ),
    ).one()

    # SUM over an empty table yields NULL, hence the `or 0` guards.
    return WordStatsItem(
        learned=int(learned or 0),
        total=int(total or 0),
        learned_occurrences=int(learned_occurrences or 0),
        total_occurrences=int(total_occurrences or 0),
    )
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def compute_sentence_stats(session: Session, threshold: int) -> SentenceStatsData:
    """Aggregate sentence-level statistics using SQL aggregation.

    Counts total/learned/in-queue sentences, reads the lifetime round counter
    from the Metadata singleton, and counts distinct unlearned words that
    queued sentences are currently targeting.
    """
    total, learned, in_queue = session.query(
        func.count(Sentence.id),
        func.sum(case((Sentence.status == 2, 1), else_=0)),
        func.sum(case((Sentence.status == 1, 1), else_=0)),
    ).one()

    meta = session.get(Metadata, 1)

    # Distinct target words of queued sentences whose word is still unlearned.
    targeted = session.query(
        func.count(func.distinct(Sentence.target_word_id)),
    ).filter(
        Sentence.status == 1,
        Sentence.target_word_id.is_not(None),
    ).join(Word, Sentence.target_word_id == Word.id).filter(
        Word.learnedness < threshold,
    ).scalar()

    # SUM over an empty table yields NULL, hence the `or 0` guards.
    return SentenceStatsData(
        learned=int(learned or 0),
        in_queue=int(in_queue or 0),
        total=int(total),
        total_rounds=meta.total_rounds if meta else 0,
        targeted_words=int(targeted or 0),
    )
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def format_section_title(title: str) -> Text:
    """Render `title` as a bold, underlined section heading."""
    heading = Text(title, style='bold underline')
    return heading
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def format_stats_label(title: str) -> Text:
    """Render a colon-suffixed stat label ("Title:") in bold."""
    label = f'{title}:'
    return Text(label, style='bold')
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
@app.command()
def stats(
    language: Annotated[
        str,
        Argument(help='Target language code to show statistics for.'),
    ],
    threshold: Annotated[
        int,
        Option(
            '--threshold', '-t',
            help='Learnedness threshold for words to be considered fully learned.',
        ),
    ] = DEFAULT_THRESHOLD,
) -> None:
    """Display study statistics for the given language.

    Queries sentence- and word-level aggregates, then renders three sections
    (sentences, words, total rounds) with rich.
    """
    # Close the session before printing: the stats dataclasses hold plain ints.
    with get_session(language) as session:
        word_stats = compute_word_stats(session, threshold)
        sentence_stats = compute_sentence_stats(session, threshold)

    console = Console(highlight=False)

    # Sentence statistics
    console.print(format_section_title('Sentence Statistics'))
    console.print(
        format_stats_label('Sentences'),
        Text(f'{sentence_stats.learned} learned + {sentence_stats.in_queue} in queue '
             f'/ {sentence_stats.total} total'),
    )
    console.print()

    # Word statistics
    console.print(format_section_title('Word Statistics'))
    text = Text(f'{word_stats.learned} learned / {word_stats.total} total')
    if word_stats.total_occurrences > 0:
        # Coverage: share of all corpus occurrences covered by learned words.
        coverage = word_stats.learned_occurrences / word_stats.total_occurrences
        text.append(' ')  # was a placeholder-less f-string; plain literal suffices
        text.append(Text(f'(coverage: {coverage:.1%})', style='dim'))
    console.print(format_stats_label('Words'), text)
    if sentence_stats.targeted_words > 0:
        console.print(
            format_stats_label('Targeted'),
            Text(f'{sentence_stats.targeted_words} unlearned words in queue'),
        )
    console.print()

    # Total rounds
    console.print(format_stats_label('Total rounds'), sentence_stats.total_rounds)
|