chiban-extract 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,25 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ jobs:
9
+ test:
10
+ runs-on: ubuntu-latest
11
+ strategy:
12
+ fail-fast: false
13
+ matrix:
14
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+ - uses: actions/setup-python@v5
18
+ with:
19
+ python-version: ${{ matrix.python-version }}
20
+ - name: Install
21
+ run: pip install -e ".[dev]"
22
+ - name: Lint
23
+ run: ruff check src tests
24
+ - name: Test
25
+ run: pytest
@@ -0,0 +1,10 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.egg-info/
4
+ dist/
5
+ build/
6
+ .pytest_cache/
7
+ .ruff_cache/
8
+ .venv/
9
+ venv/
10
+ .env
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 chiban-extract contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,174 @@
1
+ Metadata-Version: 2.4
2
+ Name: chiban-extract
3
+ Version: 0.1.0
4
+ Summary: Extract Japanese street addresses (住居表示) and land lot numbers (地番) from text and PDFs
5
+ Project-URL: Homepage, https://github.com/91st1213-blip/chiban-extract
6
+ Project-URL: Issues, https://github.com/91st1213-blip/chiban-extract/issues
7
+ Author: chiban-extract contributors
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Keywords: address,chiban,japan,japanese,pdf,real-estate,text-extraction
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Natural Language :: English
15
+ Classifier: Natural Language :: Japanese
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Text Processing
22
+ Requires-Python: >=3.10
23
+ Requires-Dist: pymupdf>=1.24
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest>=8; extra == 'dev'
26
+ Requires-Dist: ruff>=0.4; extra == 'dev'
27
+ Description-Content-Type: text/markdown
28
+
29
+ # chiban-extract
30
+
31
+ Extract Japanese street addresses (住居表示) and registered land lot numbers
32
+ (地番) from plain text and PDFs.
33
+
34
+ 日本語のテキスト・PDFから住居表示と地番を抽出する Python ライブラリ & CLI です。
35
+
36
+ [English](#english) | [日本語](#日本語)
37
+
38
+ ---
39
+
40
+ ## English
41
+
42
+ ### Why?
43
+
44
+ Japanese property documents — contracts, registry papers, brochures, press
45
+ releases — write a property's location in inconsistent formats: as a
46
+ residential indication (住居表示, `…1番1号`), as a registered lot number
47
+ (地番, `…788番地 他3筆`), with kanji numerals, or with whitespace and line
48
+ breaks injected by a PDF text layer. Generic address parsers choke on this;
49
+ `chiban-extract`'s regexes were battle-tested against hundreds of real
50
+ documents.
51
+
52
+ It also solves the problems *around* extraction:
53
+
54
+ - **Multi-property documents** — when one document covers several properties
55
+ with similar names, extraction can be scoped to the vicinity of one
56
+ property name, with confusion warnings.
57
+ - **Validation against known data** — candidates are cross-checked against a
58
+ known prefecture / city / ward so you don't accidentally pick up another
59
+ party's address mentioned in the same document.
60
+ - **Candidate ranking** — every address-like string is returned, with those
61
+ near an address keyword (住居表示 / 所在地 / 地番 / 住所) ranked first.
62
+
63
+ ### Install
64
+
65
+ ```bash
66
+ pip install chiban-extract
67
+ ```
68
+
69
+ Requires Python 3.10+. The only dependency is PyMuPDF; the text-based APIs
70
+ can be used without it ever being imported.
71
+
72
+ ### Quick start
73
+
74
+ ```python
75
+ from chiban_extract import extract_from_pdf, extract_from_text
76
+
77
+ result = extract_from_pdf("document.pdf") # path, URL, or bytes
78
+ print(result.best.address if result.best else None) # 東京都港区南青山1丁目1番1号
79
+ print(result.best.kind if result.best else None) # AddressKind.JUKYO / CHIBAN
80
+ for c in result.candidates:
81
+ print(c.address, c.kind, c.near_keyword)
82
+ ```
83
+
84
+ Scoped extraction for multi-property documents, validated against a known
85
+ address:
86
+
87
+ ```python
88
+ result = extract_from_pdf(
89
+ "multi_property.pdf",
90
+ property_name="サンプルレジデンス戸越公園",
91
+ known_address="東京都品川区",
92
+ other_property_names=["サンプルレジデンス東大井"],
93
+ )
94
+ ```
95
+
96
+ ### CLI
97
+
98
+ ```bash
99
+ chiban-extract extract document.pdf --json
100
+ chiban-extract extract https://example.com/document.pdf --known-address "東京都港区"
101
+ chiban-extract extract document.pdf --property-name "サンプルビル" --known-address "東京都港区"
102
+ ```
103
+
104
+ Exit codes: `0` success, `1` nothing found, `2` input error.
105
+
106
+ ### How it works
107
+
108
+ 1. PDF text is extracted with PyMuPDF and NFKC-normalized.
109
+ 2. A keyword-driven pass looks for an address right after 「住居表示」
110
+ (preferred — it identifies the building) and falls back to lot numbers
111
+ after 「所在地」/「地番」.
112
+ 3. A pattern-driven pass collects all address-like strings, ranking those
113
+ within 200 chars of an address keyword first.
114
+ 4. Candidates that contradict the known prefecture / city / ward are dropped.
115
+
116
+ ### License
117
+
118
+ MIT
119
+
120
+ ---
121
+
122
+ ## 日本語
123
+
124
+ ### 概要
125
+
126
+ 不動産関連文書(契約書・登記資料・パンフレット・プレスリリース等)に
127
+ 記載される所在地は、住居表示(`…1番1号`)・地番(`…788番地 他3筆`)・
128
+ 漢数字丁目・PDFテキスト層由来の空白/改行混入などフォーマットが安定しません。
129
+ 本ライブラリの正規表現は数百件の実文書で鍛えたもので、抽出まわりの
130
+ 周辺問題もまとめて解決します。
131
+
132
+ - **複数物件文書対応** — 物件名近傍にスコープした抽出と、類似名物件との
133
+ 混同警告。
134
+ - **既知住所との突合** — 都道府県/市/区トークンで候補を検証し、同一文書中の
135
+ 別当事者住所の誤抽出を防ぎます。
136
+ - **候補ランキング** — 住所らしき文字列を全件返し、キーワード
137
+ (住居表示/所在地/地番/住所)近傍のものを優先します。
138
+
139
+ ### インストール
140
+
141
+ ```bash
142
+ pip install chiban-extract
143
+ ```
144
+
145
+ Python 3.10 以上が必要です。依存は PyMuPDF のみです。
146
+
147
+ ### 使い方
148
+
149
+ ```python
150
+ from chiban_extract import extract_from_text, extract_from_pdf
151
+
152
+ result = extract_from_pdf("document.pdf", known_address="東京都港区")
153
+ if result.best:
154
+ print(result.best.address, result.best.kind) # jukyo (住居表示) / chiban (地番)
155
+ ```
156
+
157
+ CLI:
158
+
159
+ ```bash
160
+ chiban-extract extract document.pdf --json
161
+ chiban-extract extract document.pdf --known-address "東京都港区"
162
+ ```
163
+
164
+ ### 抽出ロジック
165
+
166
+ 1. PyMuPDF で全文抽出 → NFKC 正規化
167
+ 2. 「住居表示」直後の住所を優先抽出(建物を一意に示すため)、無ければ
168
+ 「所在地」「地番」直後の地番にフォールバック
169
+ 3. 汎用住所パターンで全候補を収集し、キーワード近傍(200字以内)を優先
170
+ 4. 既知住所の都道府県/市/区/町トークンと矛盾する候補を除外
171
+
172
+ ### ライセンス
173
+
174
+ MIT
@@ -0,0 +1,146 @@
1
+ # chiban-extract
2
+
3
+ Extract Japanese street addresses (住居表示) and registered land lot numbers
4
+ (地番) from plain text and PDFs.
5
+
6
+ 日本語のテキスト・PDFから住居表示と地番を抽出する Python ライブラリ & CLI です。
7
+
8
+ [English](#english) | [日本語](#日本語)
9
+
10
+ ---
11
+
12
+ ## English
13
+
14
+ ### Why?
15
+
16
+ Japanese property documents — contracts, registry papers, brochures, press
17
+ releases — write a property's location in inconsistent formats: as a
18
+ residential indication (住居表示, `…1番1号`), as a registered lot number
19
+ (地番, `…788番地 他3筆`), with kanji numerals, or with whitespace and line
20
+ breaks injected by a PDF text layer. Generic address parsers choke on this;
21
+ `chiban-extract`'s regexes were battle-tested against hundreds of real
22
+ documents.
23
+
24
+ It also solves the problems *around* extraction:
25
+
26
+ - **Multi-property documents** — when one document covers several properties
27
+ with similar names, extraction can be scoped to the vicinity of one
28
+ property name, with confusion warnings.
29
+ - **Validation against known data** — candidates are cross-checked against a
30
+ known prefecture / city / ward so you don't accidentally pick up another
31
+ party's address mentioned in the same document.
32
+ - **Candidate ranking** — every address-like string is returned, with those
33
+ near an address keyword (住居表示 / 所在地 / 地番 / 住所) ranked first.
34
+
35
+ ### Install
36
+
37
+ ```bash
38
+ pip install chiban-extract
39
+ ```
40
+
41
+ Requires Python 3.10+. The only dependency is PyMuPDF; the text-based APIs
42
+ can be used without it ever being imported.
43
+
44
+ ### Quick start
45
+
46
+ ```python
47
+ from chiban_extract import extract_from_pdf, extract_from_text
48
+
49
+ result = extract_from_pdf("document.pdf") # path, URL, or bytes
50
+ print(result.best.address if result.best else None) # 東京都港区南青山1丁目1番1号
51
+ print(result.best.kind if result.best else None) # AddressKind.JUKYO / CHIBAN
52
+ for c in result.candidates:
53
+ print(c.address, c.kind, c.near_keyword)
54
+ ```
55
+
56
+ Scoped extraction for multi-property documents, validated against a known
57
+ address:
58
+
59
+ ```python
60
+ result = extract_from_pdf(
61
+ "multi_property.pdf",
62
+ property_name="サンプルレジデンス戸越公園",
63
+ known_address="東京都品川区",
64
+ other_property_names=["サンプルレジデンス東大井"],
65
+ )
66
+ ```
67
+
68
+ ### CLI
69
+
70
+ ```bash
71
+ chiban-extract extract document.pdf --json
72
+ chiban-extract extract https://example.com/document.pdf --known-address "東京都港区"
73
+ chiban-extract extract document.pdf --property-name "サンプルビル" --known-address "東京都港区"
74
+ ```
75
+
76
+ Exit codes: `0` success, `1` nothing found, `2` input error.
77
+
78
+ ### How it works
79
+
80
+ 1. PDF text is extracted with PyMuPDF and NFKC-normalized.
81
+ 2. A keyword-driven pass looks for an address right after 「住居表示」
82
+ (preferred — it identifies the building) and falls back to lot numbers
83
+ after 「所在地」/「地番」.
84
+ 3. A pattern-driven pass collects all address-like strings, ranking those
85
+ within 200 chars of an address keyword first.
86
+ 4. Candidates that contradict the known prefecture / city / ward are dropped.
87
+
88
+ ### License
89
+
90
+ MIT
91
+
92
+ ---
93
+
94
+ ## 日本語
95
+
96
+ ### 概要
97
+
98
+ 不動産関連文書(契約書・登記資料・パンフレット・プレスリリース等)に
99
+ 記載される所在地は、住居表示(`…1番1号`)・地番(`…788番地 他3筆`)・
100
+ 漢数字丁目・PDFテキスト層由来の空白/改行混入などフォーマットが安定しません。
101
+ 本ライブラリの正規表現は数百件の実文書で鍛えたもので、抽出まわりの
102
+ 周辺問題もまとめて解決します。
103
+
104
+ - **複数物件文書対応** — 物件名近傍にスコープした抽出と、類似名物件との
105
+ 混同警告。
106
+ - **既知住所との突合** — 都道府県/市/区トークンで候補を検証し、同一文書中の
107
+ 別当事者住所の誤抽出を防ぎます。
108
+ - **候補ランキング** — 住所らしき文字列を全件返し、キーワード
109
+ (住居表示/所在地/地番/住所)近傍のものを優先します。
110
+
111
+ ### インストール
112
+
113
+ ```bash
114
+ pip install chiban-extract
115
+ ```
116
+
117
+ Python 3.10 以上が必要です。依存は PyMuPDF のみです。
118
+
119
+ ### 使い方
120
+
121
+ ```python
122
+ from chiban_extract import extract_from_text, extract_from_pdf
123
+
124
+ result = extract_from_pdf("document.pdf", known_address="東京都港区")
125
+ if result.best:
126
+ print(result.best.address, result.best.kind) # jukyo (住居表示) / chiban (地番)
127
+ ```
128
+
129
+ CLI:
130
+
131
+ ```bash
132
+ chiban-extract extract document.pdf --json
133
+ chiban-extract extract document.pdf --known-address "東京都港区"
134
+ ```
135
+
136
+ ### 抽出ロジック
137
+
138
+ 1. PyMuPDF で全文抽出 → NFKC 正規化
139
+ 2. 「住居表示」直後の住所を優先抽出(建物を一意に示すため)、無ければ
140
+ 「所在地」「地番」直後の地番にフォールバック
141
+ 3. 汎用住所パターンで全候補を収集し、キーワード近傍(200字以内)を優先
142
+ 4. 既知住所の都道府県/市/区/町トークンと矛盾する候補を除外
143
+
144
+ ### ライセンス
145
+
146
+ MIT
@@ -0,0 +1,51 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "chiban-extract"
7
+ version = "0.1.0"
8
+ description = "Extract Japanese street addresses (住居表示) and land lot numbers (地番) from text and PDFs"
9
+ readme = "README.md"
10
+ license = { text = "MIT" }
11
+ requires-python = ">=3.10"
12
+ authors = [{ name = "chiban-extract contributors" }]
13
+ keywords = ["japan", "japanese", "address", "chiban", "pdf", "text-extraction", "real-estate"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Intended Audience :: Developers",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Natural Language :: Japanese",
19
+ "Natural Language :: English",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Programming Language :: Python :: 3.13",
25
+ "Topic :: Text Processing",
26
+ ]
27
+ dependencies = ["pymupdf>=1.24"]
28
+
29
+ [project.optional-dependencies]
30
+ dev = ["pytest>=8", "ruff>=0.4"]
31
+
32
+ [project.scripts]
33
+ chiban-extract = "chiban_extract.cli:main"
34
+
35
+ [project.urls]
36
+ Homepage = "https://github.com/91st1213-blip/chiban-extract"
37
+ Issues = "https://github.com/91st1213-blip/chiban-extract/issues"
38
+
39
+ [tool.hatch.build.targets.wheel]
40
+ packages = ["src/chiban_extract"]
41
+
42
+ [tool.ruff]
43
+ line-length = 120
44
+ src = ["src", "tests"]
45
+
46
+ [tool.ruff.lint]
47
+ select = ["E", "F", "W", "I", "UP", "B"]
48
+
49
+ [tool.pytest.ini_options]
50
+ testpaths = ["tests"]
51
+ addopts = "-q"
@@ -0,0 +1,49 @@
1
+ """chiban-extract: Japanese street addresses & land lot numbers out of text and PDFs.
2
+
3
+ Quick start::
4
+
5
+ from chiban_extract import extract_from_pdf
6
+
7
+ result = extract_from_pdf("document.pdf", known_address="東京都港区")
8
+ print(result.best.address if result.best else "not found")
9
+ """
10
+
11
+ from .extract import (
12
+ extract_address_for_property,
13
+ extract_preferred_address,
14
+ find_address_candidates,
15
+ )
16
+ from .models import (
17
+ AddressCandidate,
18
+ AddressKind,
19
+ AddressMatch,
20
+ AddressTokens,
21
+ ExtractionResult,
22
+ PdfDocument,
23
+ )
24
+ from .normalize import canonicalize_address, normalize_text
25
+ from .pdf import read_pdf
26
+ from .pipeline import extract_from_pdf, extract_from_text
27
+ from .tokens import has_block_number, parse_address_tokens
28
+
29
+ __version__ = "0.1.0"
30
+
31
+ __all__ = [
32
+ "AddressCandidate",
33
+ "AddressKind",
34
+ "AddressMatch",
35
+ "AddressTokens",
36
+ "ExtractionResult",
37
+ "PdfDocument",
38
+ "canonicalize_address",
39
+ "extract_address_for_property",
40
+ "extract_from_pdf",
41
+ "extract_from_text",
42
+ "extract_preferred_address",
43
+ "find_address_candidates",
44
+ "has_block_number",
45
+ "normalize_text",
46
+ "parse_address_tokens",
47
+ "read_pdf",
48
+ "__version__",
49
+ ]
@@ -0,0 +1,76 @@
1
+ """Command-line interface.
2
+
3
+     chiban-extract extract <pdf-or-url> [--property-name N] [--known-address A] [--other-property N]... [--json]
4
+
5
+ Exit codes: 0 = success, 1 = nothing found, 2 = input/usage error.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import argparse
11
+ import json
12
+ import sys
13
+
14
+ from .pdf import read_pdf
15
+ from .pipeline import extract_from_text
16
+
17
+
18
def _print_json(obj: dict) -> None:
    """Print *obj* as JSON with a 2-space indent, leaving non-ASCII characters unescaped."""
    rendered = json.dumps(obj, indent=2, ensure_ascii=False)
    print(rendered)
20
+
21
+
22
+ def _cmd_extract(args: argparse.Namespace) -> int:  # handler for the `extract` subcommand; the return value is the exit code (0 / 1 / 2)
23
+ try:
24
+ doc = read_pdf(args.source)
25
+ except Exception as e:  # CLI boundary: any failure while reading is reported on stderr instead of propagating
26
+ print(f"error: could not read PDF: {e}", file=sys.stderr)
27
+ return 2  # input error
28
+ result = extract_from_text(
29
+ doc.text,
30
+ page_count=doc.page_count,
31
+ property_name=args.property_name,
32
+ known_address=args.known_address,
33
+ other_property_names=args.other_property or None,  # an empty list (flag never given) is passed on as None
34
+ )
35
+ if args.json:
36
+ _print_json(result.to_dict())
37
+ else:  # human-readable report
38
+ if result.best:
39
+ print(f"best: {result.best.address} [{result.best.kind.value}]")
40
+ else:
41
+ print("best: (none)")
42
+ for c in result.candidates[:10]:  # list at most the first 10 candidates
43
+ marker = "*" if result.best and c.address == result.best.address else " "  # "*" flags candidates whose address equals the best match
44
+ print(f" {marker} {c.address} [{c.kind.value}]"
45
+ f"{' (near keyword)' if c.near_keyword else ''}")
46
+ for w in result.warnings:  # warnings go to stderr, not stdout
47
+ print(f"warning: {w}", file=sys.stderr)
48
+ return 0 if result.best else 1  # exit status depends only on whether a best match exists
49
+
50
+
51
def build_parser() -> argparse.ArgumentParser:
    """Build the ``chiban-extract`` argument parser with its ``extract`` subcommand.

    A subcommand is mandatory. The subcommand's handler is stored on the
    parsed namespace as ``func`` so the caller can dispatch through it.
    """
    parser = argparse.ArgumentParser(
        description="Extract Japanese addresses and land lot numbers (chiban) from PDFs",
        prog="chiban-extract",
    )
    subcommands = parser.add_subparsers(dest="command", required=True)

    extract = subcommands.add_parser("extract", help="extract addresses from a PDF (path or URL)")
    extract.add_argument("source", help="PDF file path or http(s) URL")
    # The two plain string options share one shape, so register them from a table
    # (order is kept so the generated --help output is unchanged).
    for flag, help_text in (
        ("--property-name", "scope extraction near this property name"),
        ("--known-address", "validate candidates against this known address"),
    ):
        extract.add_argument(flag, help=help_text)
    extract.add_argument(
        "--other-property",
        action="append",
        default=[],
        help="other property names in the same document (repeatable)",
    )
    extract.add_argument("--json", action="store_true", help="output JSON")
    extract.set_defaults(func=_cmd_extract)

    return parser
68
+
69
+
70
def main(argv: list[str] | None = None) -> int:
    """Parse *argv* and run the selected subcommand, returning its exit code.

    When *argv* is ``None``, argparse reads the arguments from ``sys.argv``.
    """
    parser = build_parser()
    namespace = parser.parse_args(argv)
    handler = namespace.func
    return handler(namespace)
73
+
74
+
75
+ if __name__ == "__main__":
76
+ sys.exit(main())