PyPI - kotobase - Versions diffs - 0.1.0__tar.gz - Mend

kotobase 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53) hide show

kotobase-0.1.0/LICENSE +21 -0
kotobase-0.1.0/MANIFEST.in +1 -0
kotobase-0.1.0/PKG-INFO +149 -0
kotobase-0.1.0/README.md +123 -0
kotobase-0.1.0/pyproject.toml +42 -0
kotobase-0.1.0/setup.cfg +4 -0
kotobase-0.1.0/src/kotobase/__init__.py +20 -0
kotobase-0.1.0/src/kotobase/api.py +130 -0
kotobase-0.1.0/src/kotobase/cli.py +202 -0
kotobase-0.1.0/src/kotobase/core/__init__.py +2 -0
kotobase-0.1.0/src/kotobase/core/datatypes.py +257 -0
kotobase-0.1.0/src/kotobase/db/__init__.py +3 -0
kotobase-0.1.0/src/kotobase/db/database.py +51 -0
kotobase-0.1.0/src/kotobase/db/models.py +180 -0
kotobase-0.1.0/src/kotobase/db_builder/__init__.py +17 -0
kotobase-0.1.0/src/kotobase/db_builder/build_database.py +254 -0
kotobase-0.1.0/src/kotobase/db_builder/config.py +76 -0
kotobase-0.1.0/src/kotobase/db_builder/data/__init__.py +0 -0
kotobase-0.1.0/src/kotobase/db_builder/data/processed/__init__.py +0 -0
kotobase-0.1.0/src/kotobase/db_builder/data/processed/jlpt/__init__.py +0 -0
kotobase-0.1.0/src/kotobase/db_builder/data/processed/jlpt/grammar_n1.json +73 -0
kotobase-0.1.0/src/kotobase/db_builder/data/processed/jlpt/grammar_n2.json +65 -0
kotobase-0.1.0/src/kotobase/db_builder/data/processed/jlpt/grammar_n3.json +65 -0
kotobase-0.1.0/src/kotobase/db_builder/data/processed/jlpt/grammar_n4.json +52 -0
kotobase-0.1.0/src/kotobase/db_builder/data/processed/jlpt/grammar_n5.json +42 -0
kotobase-0.1.0/src/kotobase/db_builder/data/processed/jlpt/kanji_n1.json +7688 -0
kotobase-0.1.0/src/kotobase/db_builder/data/processed/jlpt/kanji_n2.json +2438 -0
kotobase-0.1.0/src/kotobase/db_builder/data/processed/jlpt/kanji_n3.json +2774 -0
kotobase-0.1.0/src/kotobase/db_builder/data/processed/jlpt/kanji_n4.json +1082 -0
kotobase-0.1.0/src/kotobase/db_builder/data/processed/jlpt/kanji_n5.json +524 -0
kotobase-0.1.0/src/kotobase/db_builder/data/processed/jlpt/vocab_n1.json +17227 -0
kotobase-0.1.0/src/kotobase/db_builder/data/processed/jlpt/vocab_n2.json +8987 -0
kotobase-0.1.0/src/kotobase/db_builder/data/processed/jlpt/vocab_n3.json +9137 -0
kotobase-0.1.0/src/kotobase/db_builder/data/processed/jlpt/vocab_n4.json +3172 -0
kotobase-0.1.0/src/kotobase/db_builder/data/processed/jlpt/vocab_n5.json +3347 -0
kotobase-0.1.0/src/kotobase/db_builder/download.py +56 -0
kotobase-0.1.0/src/kotobase/db_builder/process_jmdict.py +137 -0
kotobase-0.1.0/src/kotobase/db_builder/process_jmnedict.py +133 -0
kotobase-0.1.0/src/kotobase/db_builder/process_kanjidic.py +92 -0
kotobase-0.1.0/src/kotobase/db_builder/process_tatoeba.py +47 -0
kotobase-0.1.0/src/kotobase/db_builder/pull.py +34 -0
kotobase-0.1.0/src/kotobase/repos/__init__.py +11 -0
kotobase-0.1.0/src/kotobase/repos/jlpt.py +56 -0
kotobase-0.1.0/src/kotobase/repos/jmdict.py +81 -0
kotobase-0.1.0/src/kotobase/repos/jmnedict.py +53 -0
kotobase-0.1.0/src/kotobase/repos/kanji.py +76 -0
kotobase-0.1.0/src/kotobase/repos/sentences.py +35 -0
kotobase-0.1.0/src/kotobase.egg-info/PKG-INFO +149 -0
kotobase-0.1.0/src/kotobase.egg-info/SOURCES.txt +51 -0
kotobase-0.1.0/src/kotobase.egg-info/dependency_links.txt +1 -0
kotobase-0.1.0/src/kotobase.egg-info/entry_points.txt +2 -0
kotobase-0.1.0/src/kotobase.egg-info/requires.txt +8 -0
kotobase-0.1.0/src/kotobase.egg-info/top_level.txt +1 -0

kotobase-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2025 svdc
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

kotobase-0.1.0/MANIFEST.in ADDED Viewed

	@@ -0,0 +1 @@
1	+ graft src/kotobase/db_builder/data

kotobase-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,149 @@
+Metadata-Version: 2.4
+Name: kotobase
+Version: 0.1.0
+Summary: Python package for accessing a comprehensive Japanese language database.
+Author-email: svdc <svdc1mail@gmail.com>
+Maintainer-email: svdc <svdc1mail@gmail.com>
+License-Expression: MIT
+Project-URL: Homepage, https://github.com/svdC1/kotobase
+Project-URL: Issues, https://github.com/svdC1/kotobase/issues
+Keywords: Japanese,Dictionary,Language
+Classifier: Programming Language :: Python :: 3
+Classifier: Operating System :: OS Independent
+Classifier: Intended Audience :: Developers
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: requests
+Requires-Dist: sqlalchemy
+Requires-Dist: alembic
+Requires-Dist: beautifulsoup4
+Requires-Dist: lxml
+Requires-Dist: pypdf
+Requires-Dist: click
+Requires-Dist: gdown
+Dynamic: license-file
+# Kotobase
+**Kotobase is a Japanese language Python package which provides simple programmatic access to various data sources via a pre-built database which is updated weekly via a GitHub action.**
+## Data Sources
+Kotobase uses data from these sources to build its Database.
+-   [`JMDict`](http://www.edrdg.org/jmdict/j_jmdict.html) : Japanese-Multilingual Dictionary.
+-   [`JMnedict`](http://www.edrdg.org/enamdict/enamdict_doc.html) : A dictionary of Japanese proper names.
+-   [`KanjiDic2`](http://www.edrdg.org/kanjidic/kanjd2index_legacy.html) : A comprehensive kanji dictionary.
+-   [`Tatoeba`](https://tatoeba.org/en/) : A large database of example sentences.
+-   [`JLPT Lists`](http://www.tanos.co.uk/) : Curated list of Grammar, Vocabulary and Kanji separated by Japanese Language Proficiency Test levels, made available on Jonathan Weller's website.
+### Licenses
+> The licenses of these data sources and the NOTICE are available at `docs/licenses` in this repository.
+## Features
+-   **Comprehensive Lookups** &rarr; Search for words (kanji, kana, or romaji), kanji, and proper names.
+-   **Organized Data** &rarr; Get detailed information including readings, senses, parts of speech, kanji stroke counts, meanings, and JLPT levels formatted into Python Data Objects.
+-   **Example Sentences** &rarr; Find example sentences from Tatoeba that contain the searched query.
+-   **Wildcard Search** &rarr; Use `*` or `%` for wildcard searches.
+-   **Command-Line Interface** &rarr; User-friendly CLI for quick lookups from the terminal.
+-   **Self-Contained** &rarr; All data is stored in a local SQLite database, so it's fast and works offline.
+-   **Easy Database Management** &rarr; Includes commands to automatically download the latest pre-built database from the public Drive or download source files and build the database locally.
+## Installation
+```bash
+pip install kotobase
+```
+This will install the `kotobase` package and its dependencies, and it will also make the `kotobase` command-line tool available in your shell.
+## Usage
+Kotobase can be used as a command-line tool or as a Python library.
+### Command-Line Interface
+The `kotobase` command provides several subcommands for different types of lookups.
+#### General Lookup
+The `lookup` command is the most comprehensive way to search for a word.
+```bash
+kotobase lookup 日本語
+```
+This will show you dictionary entries, kanji information, JLPT levels, and example sentences for the word "日本語".
+**Options:**
+-   `-n`, `--names`: Include proper names from JMnedict in the search.
+-   `-w`, `--wildcard`: Treat `*` or `%` as wildcards in the search term.
+-   `-s`, `--sentences`: Specify the number of example sentences to show.
+-   `--json-out`: Output the full results as a JSON object.
+#### Kanji Lookup
+To get information about a specific kanji character:
+```bash
+kotobase kanji 語
+```
+This will display the kanji's grade, stroke count, meanings, on'yomi and kun'yomi readings, and JLPT level.
+#### JLPT Lookup
+To check the JLPT level for a word or kanji:
+```bash
+kotobase jlpt 勉強
+```
+### Python API
+You can also use Kotobase in your own Python code.
+```python
+from kotobase import Kotobase
+kb = Kotobase()
+# Comprehensive lookup
+result = kb.lookup("日本語")
+print(result.to_json(indent=2, ensure_ascii=False))
+# Get info for a single kanji
+kanji_info = kb.kanji("語")
+print(kanji_info)
+# Get example sentences
+sentences = kb.sentences("勉強")
+for sentence in sentences:
+    print(sentence.text)
+```
+## Database
+Kotobase relies on a local SQLite database.
+You can either download a pre-built copy or build it from the source files yourself.
+The following commands are available for managing the database:
+-   `kotobase pull-db`: Downloads the pre-built SQLite database from a public [`Google Drive Folder`](https://drive.google.com/drive/u/0/folders/14wbgMyp0TubFyFaUy0W_CnK9_z7fo_Fv). This file is overwritten every week with a rebuilt database from updated sources. The rebuilding and overwriting is managed by a GitHub action in this repository.
+-   `kotobase build`: Builds the SQLite database from the raw source files. This will download the latest version of the source files (_except the Tanos JLPT lists, which are shipped with the package itself_) and build the database locally.

kotobase-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,123 @@
+# Kotobase
+**Kotobase is a Japanese language Python package which provides simple programmatic access to various data sources via a pre-built database which is updated weekly via a GitHub action.**
+## Data Sources
+Kotobase uses data from these sources to build its Database.
+-   [`JMDict`](http://www.edrdg.org/jmdict/j_jmdict.html) : Japanese-Multilingual Dictionary.
+-   [`JMnedict`](http://www.edrdg.org/enamdict/enamdict_doc.html) : A dictionary of Japanese proper names.
+-   [`KanjiDic2`](http://www.edrdg.org/kanjidic/kanjd2index_legacy.html) : A comprehensive kanji dictionary.
+-   [`Tatoeba`](https://tatoeba.org/en/) : A large database of example sentences.
+-   [`JLPT Lists`](http://www.tanos.co.uk/) : Curated list of Grammar, Vocabulary and Kanji separated by Japanese Language Proficiency Test levels, made available on Jonathan Weller's website.
+### Licenses
+> The licenses of these data sources and the NOTICE are available at `docs/licenses` in this repository.
+## Features
+-   **Comprehensive Lookups** &rarr; Search for words (kanji, kana, or romaji), kanji, and proper names.
+-   **Organized Data** &rarr; Get detailed information including readings, senses, parts of speech, kanji stroke counts, meanings, and JLPT levels formatted into Python Data Objects.
+-   **Example Sentences** &rarr; Find example sentences from Tatoeba that contain the searched query.
+-   **Wildcard Search** &rarr; Use `*` or `%` for wildcard searches.
+-   **Command-Line Interface** &rarr; User-friendly CLI for quick lookups from the terminal.
+-   **Self-Contained** &rarr; All data is stored in a local SQLite database, so it's fast and works offline.
+-   **Easy Database Management** &rarr; Includes commands to automatically download the latest pre-built database from the public Drive or download source files and build the database locally.
+## Installation
+```bash
+pip install kotobase
+```
+This will install the `kotobase` package and its dependencies, and it will also make the `kotobase` command-line tool available in your shell.
+## Usage
+Kotobase can be used as a command-line tool or as a Python library.
+### Command-Line Interface
+The `kotobase` command provides several subcommands for different types of lookups.
+#### General Lookup
+The `lookup` command is the most comprehensive way to search for a word.
+```bash
+kotobase lookup 日本語
+```
+This will show you dictionary entries, kanji information, JLPT levels, and example sentences for the word "日本語".
+**Options:**
+-   `-n`, `--names`: Include proper names from JMnedict in the search.
+-   `-w`, `--wildcard`: Treat `*` or `%` as wildcards in the search term.
+-   `-s`, `--sentences`: Specify the number of example sentences to show.
+-   `--json-out`: Output the full results as a JSON object.
+#### Kanji Lookup
+To get information about a specific kanji character:
+```bash
+kotobase kanji 語
+```
+This will display the kanji's grade, stroke count, meanings, on'yomi and kun'yomi readings, and JLPT level.
+#### JLPT Lookup
+To check the JLPT level for a word or kanji:
+```bash
+kotobase jlpt 勉強
+```
+### Python API
+You can also use Kotobase in your own Python code.
+```python
+from kotobase import Kotobase
+kb = Kotobase()
+# Comprehensive lookup
+result = kb.lookup("日本語")
+print(result.to_json(indent=2, ensure_ascii=False))
+# Get info for a single kanji
+kanji_info = kb.kanji("語")
+print(kanji_info)
+# Get example sentences
+sentences = kb.sentences("勉強")
+for sentence in sentences:
+    print(sentence.text)
+```
+## Database
+Kotobase relies on a local SQLite database.
+You can either download a pre-built copy or build it from the source files yourself.
+The following commands are available for managing the database:
+-   `kotobase pull-db`: Downloads the pre-built SQLite database from a public [`Google Drive Folder`](https://drive.google.com/drive/u/0/folders/14wbgMyp0TubFyFaUy0W_CnK9_z7fo_Fv). This file is overwritten every week with a rebuilt database from updated sources. The rebuilding and overwriting is managed by a GitHub action in this repository.
+-   `kotobase build`: Builds the SQLite database from the raw source files. This will download the latest version of the source files (_except the Tanos JLPT lists, which are shipped with the package itself_) and build the database locally.

kotobase-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,42 @@
+[build-system]
+requires = ["setuptools>=77.0.3"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "kotobase"
+version = "0.1.0"
+authors = [{ name = "svdc", email = "svdc1mail@gmail.com" }]
+maintainers = [{ name = "svdc", email = "svdc1mail@gmail.com" }]
+description = "Python package for accessing a comprehensive Japanese language database."
+keywords = ["Japanese", "Dictionary", "Language"]
+readme = "README.md"
+requires-python = ">=3.9"
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "Operating System :: OS Independent",
+    "Intended Audience :: Developers",
+]
+license = "MIT"
+license-files = ["LICEN[CS]E*"]
+dependencies = [
+    "requests",
+    "sqlalchemy",
+    "alembic",
+    "beautifulsoup4",
+    "lxml",
+    "pypdf",
+    "click",
+    "gdown",
+]
+[project.scripts]
+kotobase = "kotobase.cli:main"
+[project.urls]
+Homepage = "https://github.com/svdC1/kotobase"
+Issues = "https://github.com/svdC1/kotobase/issues"
+[tool.setuptools]
+include-package-data = true
+[tool.setuptools.packages.find]
+where = ["src"]

kotobase-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

kotobase-0.1.0/src/kotobase/__init__.py ADDED Viewed

@@ -0,0 +1,20 @@
+from .db.database import get_db
+from .api import Kotobase
+from .db import models
+from . import (core,
+               api,
+               db,
+               db_builder,
+               repos,
+               cli)
+__all__ = ["Kotobase",
+           "get_db",
+           "models",
+           "core",
+           "api",
+           "db",
+           "db_builder",
+           "repos",
+           "cli"]

kotobase-0.1.0/src/kotobase/api.py ADDED Viewed

@@ -0,0 +1,130 @@
+from __future__ import annotations
+import concurrent.futures as _cf
+from functools import lru_cache
+from typing import (List,
+                    Dict,
+                    Optional)
+from kotobase.core.datatypes import (
+    LookupResult,
+    KanjiDTO,
+    JMDictEntryDTO,
+    JMNeDictEntryDTO,
+    )
+from kotobase.repos.jmdict import JMDictRepo
+from kotobase.repos.jmnedict import JMNeDictRepo
+from kotobase.repos.kanji import KanjiRepo
+from kotobase.repos.jlpt import JLPTRepo
+from kotobase.repos.sentences import SentenceRepo
+class Kotobase:
+    """
+    Stateless class that orchestrates the individual repositories and
+    returns rich, serialisable objects.
+    """
+    # Core
+    def lookup(
+        self,
+        word: str,
+        *,
+        wildcard: bool = False,
+        include_names: bool = False,
+        sentence_limit: int = 50,
+    ) -> LookupResult:
+        """
+        Comprehensive word lookup.
+        Parameters
+        ----------
+        word : str
+            The query string (kana, kanji, or romaji transliteration).
+            Supports SQL wildcards '*' or '%'.
+        wildcard : bool, default False
+            If True, passes wildcards through unchanged.  If False,
+            the search is exact (JMdict) but Tatoeba uses `%word%`
+            containment.
+        include_names : bool, default False
+            Also query JMnedict (proper names).  Can be slow on very
+            broad wildcards.
+        sentence_limit : int, default 50
+            Maximum number of Tatoeba sentences to fetch.
+        Returns
+        -------
+        LookupResult
+        """
+        # 1. Find dictionary entries (JMdict & optionally JMnedict)
+        entries: List[JMDictEntryDTO | JMNeDictEntryDTO] = []
+        entries.extend(
+            JMDictRepo.search_form(word, limit=None if wildcard else 50))
+        if include_names:
+            entries.extend(JMNeDictRepo.search(word, limit=50))
+        # 2. Extract unique kanji found in the *query* itself
+        kanji_chars = [c for c in word if "\u4e00" <= c <= "\u9fff"]
+        kanji_info: List[KanjiDTO] = KanjiRepo.bulk_fetch(kanji_chars)
+        # 3. Parallel extra look-ups (JLPT + sentences) -----------------
+        with _cf.ThreadPoolExecutor(max_workers=3) as pool:
+            f_vocab = pool.submit(JLPTRepo.vocab_by_word, word)
+            f_levels = pool.submit(JLPTRepo.kanji_levels, kanji_chars)
+            f_grammar = pool.submit(JLPTRepo.grammar_entries_like, word)
+            f_sent = pool.submit(
+                SentenceRepo.search_containing,
+                word,
+                limit=sentence_limit,
+                wildcard=wildcard,
+            )
+            jlpt_vocab = f_vocab.result()
+            jlpt_kanji_levels: Dict[str, int] = f_levels.result()
+            jlpt_grammar = f_grammar.result()
+            sentences = f_sent.result()
+        # 4. Aggregate
+        return LookupResult(
+            word=word,
+            entries=entries,
+            kanji=kanji_info,
+            jlpt_vocab=jlpt_vocab,
+            jlpt_kanji_levels=jlpt_kanji_levels,
+            jlpt_grammar=jlpt_grammar,
+            examples=sentences,
+        )
+    # Convenience Wrappers
+    @staticmethod
+    @lru_cache(maxsize=10_000)
+    def kanji(literal: str):
+        """Return a single KanjiDTO (or None)."""
+        return KanjiRepo.by_literal(literal)
+    @staticmethod
+    @lru_cache(maxsize=20_000)
+    def jlpt_level(word: str) -> Optional[int]:
+        """Shortcut – just return JLPT vocab level for a word."""
+        dto = JLPTRepo.vocab_by_word(word)
+        return dto.level if dto else None
+    @staticmethod
+    def sentences(text: str, *, limit: int = 20):
+        """Fetch Japanese Tatoeba sentences containing *text*."""
+        return SentenceRepo.search_containing(text, limit=limit)
+    def __call__(self, word: str, **kwargs):
+        """Alias for `lookup` so you can `Kotobase()(word)`."""
+        return self.lookup(word, **kwargs)
+    # Context Manager
+    def __enter__(self):
+        return self
+    def __exit__(self, exc_type, exc, tb):
+        # Propagate Exceptions
+        return False
+__all__ = ["Kotobase"]