PyPI - chiban-extract - Versions diffs - 0.1.0__tar.gz - Mend

chiban-extract 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

chiban_extract-0.1.0/.github/workflows/ci.yml +25 -0
chiban_extract-0.1.0/.gitignore +10 -0
chiban_extract-0.1.0/LICENSE +21 -0
chiban_extract-0.1.0/PKG-INFO +174 -0
chiban_extract-0.1.0/README.md +146 -0
chiban_extract-0.1.0/pyproject.toml +51 -0
chiban_extract-0.1.0/src/chiban_extract/__init__.py +49 -0
chiban_extract-0.1.0/src/chiban_extract/cli.py +76 -0
chiban_extract-0.1.0/src/chiban_extract/extract.py +176 -0
chiban_extract-0.1.0/src/chiban_extract/models.py +89 -0
chiban_extract-0.1.0/src/chiban_extract/normalize.py +25 -0
chiban_extract-0.1.0/src/chiban_extract/patterns.py +55 -0
chiban_extract-0.1.0/src/chiban_extract/pdf.py +45 -0
chiban_extract-0.1.0/src/chiban_extract/pipeline.py +88 -0
chiban_extract-0.1.0/src/chiban_extract/tokens.py +80 -0
chiban_extract-0.1.0/tests/conftest.py +63 -0
chiban_extract-0.1.0/tests/test_cli.py +43 -0
chiban_extract-0.1.0/tests/test_extract.py +83 -0
chiban_extract-0.1.0/tests/test_patterns.py +57 -0
chiban_extract-0.1.0/tests/test_pdf.py +51 -0
chiban_extract-0.1.0/tests/test_pipeline.py +58 -0
chiban_extract-0.1.0/tests/test_tokens.py +44 -0

chiban_extract-0.1.0/.github/workflows/ci.yml ADDED Viewed

@@ -0,0 +1,25 @@
+name: CI
+on:
+  push:
+    branches: [main]
+  pull_request:
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install
+        run: pip install -e ".[dev]"
+      - name: Lint
+        run: ruff check src tests
+      - name: Test
+        run: pytest

chiban_extract-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,10 @@
+__pycache__/
+*.pyc
+*.egg-info/
+dist/
+build/
+.pytest_cache/
+.ruff_cache/
+.venv/
+venv/
+.env

chiban_extract-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 chiban-extract contributors
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

chiban_extract-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,174 @@
+Metadata-Version: 2.4
+Name: chiban-extract
+Version: 0.1.0
+Summary: Extract Japanese street addresses (住居表示) and land lot numbers (地番) from text and PDFs
+Project-URL: Homepage, https://github.com/91st1213-blip/chiban-extract
+Project-URL: Issues, https://github.com/91st1213-blip/chiban-extract/issues
+Author: chiban-extract contributors
+License: MIT
+License-File: LICENSE
+Keywords: address,chiban,japan,japanese,pdf,real-estate,text-extraction
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Natural Language :: English
+Classifier: Natural Language :: Japanese
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Text Processing
+Requires-Python: >=3.10
+Requires-Dist: pymupdf>=1.24
+Provides-Extra: dev
+Requires-Dist: pytest>=8; extra == 'dev'
+Requires-Dist: ruff>=0.4; extra == 'dev'
+Description-Content-Type: text/markdown
+# chiban-extract
+Extract Japanese street addresses (住居表示) and registered land lot numbers
+(地番) from plain text and PDFs.
+日本語のテキスト・PDFから住居表示と地番を抽出する Python ライブラリ & CLI です。
+[English](#english) | [日本語](#日本語)
+---
+## English
+### Why?
+Japanese property documents — contracts, registry papers, brochures, press
+releases — write a property's location in inconsistent formats: as a
+residential indication (住居表示, `…1番1号`), as a registered lot number
+(地番, `…788番地 他3筆`), with kanji numerals, or with whitespace and line
+breaks injected by a PDF text layer. Generic address parsers choke on this;
+`chiban-extract`'s regexes were battle-tested against hundreds of real
+documents.
+It also solves the problems *around* extraction:
+- **Multi-property documents** — when one document covers several properties
+  with similar names, extraction can be scoped to the vicinity of one
+  property name, with confusion warnings.
+- **Validation against known data** — candidates are cross-checked against a
+  known prefecture / city / ward so you don't accidentally pick up another
+  party's address mentioned in the same document.
+- **Candidate ranking** — every address-like string is returned, with those
+  near an address keyword (住居表示 / 所在地 / 地番 / 住所) ranked first.
+### Install
+```bash
+pip install chiban-extract
+```
+Requires Python 3.10+. The only dependency is PyMuPDF, and it is needed only
+for PDF input: the text-based APIs can be used without it ever being imported.
+### Quick start
+```python
+from chiban_extract import extract_from_pdf, extract_from_text
+result = extract_from_pdf("document.pdf")                  # path, URL, or bytes
+print(result.best.address if result.best else None)        # 東京都港区南青山1丁目1番1号
+print(result.best.kind if result.best else None)           # AddressKind.JUKYO / CHIBAN
+for c in result.candidates:
+    print(c.address, c.kind, c.near_keyword)
+```
+Scoped extraction for multi-property documents, validated against a known
+address:
+```python
+result = extract_from_pdf(
+    "multi_property.pdf",
+    property_name="サンプルレジデンス戸越公園",
+    known_address="東京都品川区",
+    other_property_names=["サンプルレジデンス東大井"],
+)
+```
+### CLI
+```bash
+chiban-extract extract document.pdf --json
+chiban-extract extract https://example.com/document.pdf --known-address "東京都港区"
+chiban-extract extract document.pdf --property-name "サンプルビル" --known-address "東京都港区"
+```
+Exit codes: `0` success, `1` nothing found, `2` input error.
+### How it works
+1. PDF text is extracted with PyMuPDF and NFKC-normalized.
+2. A keyword-driven pass looks for an address right after 「住居表示」
+   (preferred — it identifies the building) and falls back to lot numbers
+   after 「所在地」/「地番」.
+3. A pattern-driven pass collects all address-like strings, ranking those
+   within 200 chars of an address keyword first.
+4. Candidates that contradict the known prefecture / city / ward are dropped.
+### License
+MIT
+---
+## 日本語
+### 概要
+不動産関連文書(契約書・登記資料・パンフレット・プレスリリース等)に
+記載される所在地は、住居表示(`…1番1号`)・地番(`…788番地 他3筆`)・
+漢数字丁目・PDFテキスト層由来の空白/改行混入などフォーマットが安定しません。
+本ライブラリの正規表現は数百件の実文書で鍛えたもので、抽出まわりの
+周辺問題もまとめて解決します。
+- **複数物件文書対応** — 物件名近傍にスコープした抽出と、類似名物件との
+  混同警告。
+- **既知住所との突合** — 都道府県/市/区トークンで候補を検証し、同一文書中の
+  別当事者住所の誤抽出を防ぎます。
+- **候補ランキング** — 住所らしき文字列を全件返し、キーワード
+  (住居表示/所在地/地番/住所)近傍のものを優先します。
+### インストール
+```bash
+pip install chiban-extract
+```
+Python 3.10 以上が必要です。依存は PyMuPDF のみです。
+### 使い方
+```python
+from chiban_extract import extract_from_text, extract_from_pdf
+result = extract_from_pdf("document.pdf", known_address="東京都港区")
+if result.best:
+    print(result.best.address, result.best.kind)  # jukyo (住居表示) / chiban (地番)
+```
+CLI:
+```bash
+chiban-extract extract document.pdf --json
+chiban-extract extract document.pdf --known-address "東京都港区"
+```
+### 抽出ロジック
+1. PyMuPDF で全文抽出 → NFKC 正規化
+2. 「住居表示」直後の住所を優先抽出(建物を一意に示すため)、無ければ
+   「所在地」「地番」直後の地番にフォールバック
+3. 汎用住所パターンで全候補を収集し、キーワード近傍(200字以内)を優先
+4. 既知住所の都道府県/市/区/町トークンと矛盾する候補を除外
+### ライセンス
+MIT

chiban_extract-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,146 @@
+# chiban-extract
+Extract Japanese street addresses (住居表示) and registered land lot numbers
+(地番) from plain text and PDFs.
+日本語のテキスト・PDFから住居表示と地番を抽出する Python ライブラリ & CLI です。
+[English](#english) | [日本語](#日本語)
+---
+## English
+### Why?
+Japanese property documents — contracts, registry papers, brochures, press
+releases — write a property's location in inconsistent formats: as a
+residential indication (住居表示, `…1番1号`), as a registered lot number
+(地番, `…788番地 他3筆`), with kanji numerals, or with whitespace and line
+breaks injected by a PDF text layer. Generic address parsers choke on this;
+`chiban-extract`'s regexes were battle-tested against hundreds of real
+documents.
+It also solves the problems *around* extraction:
+- **Multi-property documents** — when one document covers several properties
+  with similar names, extraction can be scoped to the vicinity of one
+  property name, with confusion warnings.
+- **Validation against known data** — candidates are cross-checked against a
+  known prefecture / city / ward so you don't accidentally pick up another
+  party's address mentioned in the same document.
+- **Candidate ranking** — every address-like string is returned, with those
+  near an address keyword (住居表示 / 所在地 / 地番 / 住所) ranked first.
+### Install
+```bash
+pip install chiban-extract
+```
+Requires Python 3.10+. The only dependency is PyMuPDF, and it is needed only
+for PDF input: the text-based APIs can be used without it ever being imported.
+### Quick start
+```python
+from chiban_extract import extract_from_pdf, extract_from_text
+result = extract_from_pdf("document.pdf")                  # path, URL, or bytes
+print(result.best.address if result.best else None)        # 東京都港区南青山1丁目1番1号
+print(result.best.kind if result.best else None)           # AddressKind.JUKYO / CHIBAN
+for c in result.candidates:
+    print(c.address, c.kind, c.near_keyword)
+```
+Scoped extraction for multi-property documents, validated against a known
+address:
+```python
+result = extract_from_pdf(
+    "multi_property.pdf",
+    property_name="サンプルレジデンス戸越公園",
+    known_address="東京都品川区",
+    other_property_names=["サンプルレジデンス東大井"],
+)
+```
+### CLI
+```bash
+chiban-extract extract document.pdf --json
+chiban-extract extract https://example.com/document.pdf --known-address "東京都港区"
+chiban-extract extract document.pdf --property-name "サンプルビル" --known-address "東京都港区"
+```
+Exit codes: `0` success, `1` nothing found, `2` input error.
+### How it works
+1. PDF text is extracted with PyMuPDF and NFKC-normalized.
+2. A keyword-driven pass looks for an address right after 「住居表示」
+   (preferred — it identifies the building) and falls back to lot numbers
+   after 「所在地」/「地番」.
+3. A pattern-driven pass collects all address-like strings, ranking those
+   within 200 chars of an address keyword first.
+4. Candidates that contradict the known prefecture / city / ward are dropped.
+### License
+MIT
+---
+## 日本語
+### 概要
+不動産関連文書(契約書・登記資料・パンフレット・プレスリリース等)に
+記載される所在地は、住居表示(`…1番1号`)・地番(`…788番地 他3筆`)・
+漢数字丁目・PDFテキスト層由来の空白/改行混入などフォーマットが安定しません。
+本ライブラリの正規表現は数百件の実文書で鍛えたもので、抽出まわりの
+周辺問題もまとめて解決します。
+- **複数物件文書対応** — 物件名近傍にスコープした抽出と、類似名物件との
+  混同警告。
+- **既知住所との突合** — 都道府県/市/区トークンで候補を検証し、同一文書中の
+  別当事者住所の誤抽出を防ぎます。
+- **候補ランキング** — 住所らしき文字列を全件返し、キーワード
+  (住居表示/所在地/地番/住所)近傍のものを優先します。
+### インストール
+```bash
+pip install chiban-extract
+```
+Python 3.10 以上が必要です。依存は PyMuPDF のみです。
+### 使い方
+```python
+from chiban_extract import extract_from_text, extract_from_pdf
+result = extract_from_pdf("document.pdf", known_address="東京都港区")
+if result.best:
+    print(result.best.address, result.best.kind)  # jukyo (住居表示) / chiban (地番)
+```
+CLI:
+```bash
+chiban-extract extract document.pdf --json
+chiban-extract extract document.pdf --known-address "東京都港区"
+```
+### 抽出ロジック
+1. PyMuPDF で全文抽出 → NFKC 正規化
+2. 「住居表示」直後の住所を優先抽出(建物を一意に示すため)、無ければ
+   「所在地」「地番」直後の地番にフォールバック
+3. 汎用住所パターンで全候補を収集し、キーワード近傍(200字以内)を優先
+4. 既知住所の都道府県/市/区/町トークンと矛盾する候補を除外
+### ライセンス
+MIT

chiban_extract-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,51 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[project]
+name = "chiban-extract"
+version = "0.1.0"
+description = "Extract Japanese street addresses (住居表示) and land lot numbers (地番) from text and PDFs"
+readme = "README.md"
+license = { text = "MIT" }
+requires-python = ">=3.10"
+authors = [{ name = "chiban-extract contributors" }]
+keywords = ["japan", "japanese", "address", "chiban", "pdf", "text-extraction", "real-estate"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Natural Language :: Japanese",
+    "Natural Language :: English",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Topic :: Text Processing",
+]
+dependencies = ["pymupdf>=1.24"]
+[project.optional-dependencies]
+dev = ["pytest>=8", "ruff>=0.4"]
+[project.scripts]
+chiban-extract = "chiban_extract.cli:main"
+[project.urls]
+Homepage = "https://github.com/91st1213-blip/chiban-extract"
+Issues = "https://github.com/91st1213-blip/chiban-extract/issues"
+[tool.hatch.build.targets.wheel]
+packages = ["src/chiban_extract"]
+[tool.ruff]
+line-length = 120
+src = ["src", "tests"]
+[tool.ruff.lint]
+select = ["E", "F", "W", "I", "UP", "B"]
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+addopts = "-q"

chiban_extract-0.1.0/src/chiban_extract/__init__.py ADDED Viewed

@@ -0,0 +1,49 @@
+"""chiban-extract: Japanese street addresses & land lot numbers out of text and PDFs.
+Quick start::
+    from chiban_extract import extract_from_pdf
+    result = extract_from_pdf("document.pdf", known_address="東京都港区")
+    print(result.best.address if result.best else "not found")
+"""
+from .extract import (
+    extract_address_for_property,
+    extract_preferred_address,
+    find_address_candidates,
+)
+from .models import (
+    AddressCandidate,
+    AddressKind,
+    AddressMatch,
+    AddressTokens,
+    ExtractionResult,
+    PdfDocument,
+)
+from .normalize import canonicalize_address, normalize_text
+from .pdf import read_pdf
+from .pipeline import extract_from_pdf, extract_from_text
+from .tokens import has_block_number, parse_address_tokens
+__version__ = "0.1.0"  # NOTE(review): duplicated in pyproject.toml (version); keep in sync
+__all__ = [  # public API: the names re-exported from the submodules imported above
+    "AddressCandidate",
+    "AddressKind",
+    "AddressMatch",
+    "AddressTokens",
+    "ExtractionResult",
+    "PdfDocument",
+    "canonicalize_address",
+    "extract_address_for_property",
+    "extract_from_pdf",
+    "extract_from_text",
+    "extract_preferred_address",
+    "find_address_candidates",
+    "has_block_number",
+    "normalize_text",
+    "parse_address_tokens",
+    "read_pdf",
+    "__version__",
+]

chiban_extract-0.1.0/src/chiban_extract/cli.py ADDED Viewed

@@ -0,0 +1,76 @@
+"""Command-line interface.
+    chiban-extract extract <pdf-or-url> [--property-name N] [--known-address A] [--other-property N]... [--json]
+Exit codes: 0 = success, 1 = nothing found, 2 = input/usage error.
+"""
+from __future__ import annotations
+import argparse
+import json
+import sys
+from .pdf import read_pdf
+from .pipeline import extract_from_text
+def _print_json(obj: dict) -> None:
+    print(json.dumps(obj, ensure_ascii=False, indent=2))
+def _cmd_extract(args: argparse.Namespace) -> int:
+    try:
+        doc = read_pdf(args.source)
+    except Exception as e:
+        print(f"error: could not read PDF: {e}", file=sys.stderr)
+        return 2
+    result = extract_from_text(
+        doc.text,
+        page_count=doc.page_count,
+        property_name=args.property_name,
+        known_address=args.known_address,
+        other_property_names=args.other_property or None,
+    )
+    if args.json:
+        _print_json(result.to_dict())
+    else:
+        if result.best:
+            print(f"best: {result.best.address} [{result.best.kind.value}]")
+        else:
+            print("best: (none)")
+        for c in result.candidates[:10]:
+            marker = "*" if result.best and c.address == result.best.address else " "
+            print(f"  {marker} {c.address} [{c.kind.value}]"
+                  f"{' (near keyword)' if c.near_keyword else ''}")
+        for w in result.warnings:
+            print(f"warning: {w}", file=sys.stderr)
+    return 0 if result.best else 1
+def build_parser() -> argparse.ArgumentParser:
+    p = argparse.ArgumentParser(
+        prog="chiban-extract",
+        description="Extract Japanese addresses and land lot numbers (chiban) from PDFs",
+    )
+    sub = p.add_subparsers(dest="command", required=True)
+    pe = sub.add_parser("extract", help="extract addresses from a PDF (path or URL)")
+    pe.add_argument("source", help="PDF file path or http(s) URL")
+    pe.add_argument("--property-name", help="scope extraction near this property name")
+    pe.add_argument("--known-address", help="validate candidates against this known address")
+    pe.add_argument("--other-property", action="append", default=[],
+                    help="other property names in the same document (repeatable)")
+    pe.add_argument("--json", action="store_true", help="output JSON")
+    pe.set_defaults(func=_cmd_extract)
+    return p
+def main(argv: list[str] | None = None) -> int:
+    args = build_parser().parse_args(argv)
+    return args.func(args)
+if __name__ == "__main__":
+    sys.exit(main())