litescrape 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- litescrape-0.1.1/.gitignore +218 -0
- litescrape-0.1.1/.python-version +1 -0
- litescrape-0.1.1/LICENSE +21 -0
- litescrape-0.1.1/PKG-INFO +327 -0
- litescrape-0.1.1/README.md +312 -0
- litescrape-0.1.1/litescrape/__init__.py +41 -0
- litescrape-0.1.1/litescrape/browser.py +180 -0
- litescrape-0.1.1/litescrape/core.py +672 -0
- litescrape-0.1.1/litescrape/utils.py +199 -0
- litescrape-0.1.1/main.py +6 -0
- litescrape-0.1.1/pyproject.toml +15 -0
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[codz]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
# Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
# uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
# poetry.lock
|
|
109
|
+
# poetry.toml
|
|
110
|
+
|
|
111
|
+
# pdm
|
|
112
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
113
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
|
114
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
|
115
|
+
# pdm.lock
|
|
116
|
+
# pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# pixi
|
|
121
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
|
122
|
+
# pixi.lock
|
|
123
|
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
|
124
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
|
125
|
+
.pixi
|
|
126
|
+
|
|
127
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
128
|
+
__pypackages__/
|
|
129
|
+
|
|
130
|
+
# Celery stuff
|
|
131
|
+
celerybeat-schedule
|
|
132
|
+
celerybeat.pid
|
|
133
|
+
|
|
134
|
+
# Redis
|
|
135
|
+
*.rdb
|
|
136
|
+
*.aof
|
|
137
|
+
*.pid
|
|
138
|
+
|
|
139
|
+
# RabbitMQ
|
|
140
|
+
mnesia/
|
|
141
|
+
rabbitmq/
|
|
142
|
+
rabbitmq-data/
|
|
143
|
+
|
|
144
|
+
# ActiveMQ
|
|
145
|
+
activemq-data/
|
|
146
|
+
|
|
147
|
+
# SageMath parsed files
|
|
148
|
+
*.sage.py
|
|
149
|
+
|
|
150
|
+
# Environments
|
|
151
|
+
.env
|
|
152
|
+
.envrc
|
|
153
|
+
.venv
|
|
154
|
+
env/
|
|
155
|
+
venv/
|
|
156
|
+
ENV/
|
|
157
|
+
env.bak/
|
|
158
|
+
venv.bak/
|
|
159
|
+
|
|
160
|
+
# Spyder project settings
|
|
161
|
+
.spyderproject
|
|
162
|
+
.spyproject
|
|
163
|
+
|
|
164
|
+
# Rope project settings
|
|
165
|
+
.ropeproject
|
|
166
|
+
|
|
167
|
+
# mkdocs documentation
|
|
168
|
+
/site
|
|
169
|
+
|
|
170
|
+
# mypy
|
|
171
|
+
.mypy_cache/
|
|
172
|
+
.dmypy.json
|
|
173
|
+
dmypy.json
|
|
174
|
+
|
|
175
|
+
# Pyre type checker
|
|
176
|
+
.pyre/
|
|
177
|
+
|
|
178
|
+
# pytype static type analyzer
|
|
179
|
+
.pytype/
|
|
180
|
+
|
|
181
|
+
# Cython debug symbols
|
|
182
|
+
cython_debug/
|
|
183
|
+
|
|
184
|
+
# PyCharm
|
|
185
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
186
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
187
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
188
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
189
|
+
# .idea/
|
|
190
|
+
|
|
191
|
+
# Abstra
|
|
192
|
+
# Abstra is an AI-powered process automation framework.
|
|
193
|
+
# Ignore directories containing user credentials, local state, and settings.
|
|
194
|
+
# Learn more at https://abstra.io/docs
|
|
195
|
+
.abstra/
|
|
196
|
+
|
|
197
|
+
# Visual Studio Code
|
|
198
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
199
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
200
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
201
|
+
# you could uncomment the following to ignore the entire vscode folder
|
|
202
|
+
# .vscode/
|
|
203
|
+
# Temporary file for partial code execution
|
|
204
|
+
tempCodeRunnerFile.py
|
|
205
|
+
|
|
206
|
+
# Ruff stuff:
|
|
207
|
+
.ruff_cache/
|
|
208
|
+
|
|
209
|
+
# PyPI configuration file
|
|
210
|
+
.pypirc
|
|
211
|
+
|
|
212
|
+
# Marimo
|
|
213
|
+
marimo/_static/
|
|
214
|
+
marimo/_lsp/
|
|
215
|
+
__marimo__/
|
|
216
|
+
|
|
217
|
+
# Streamlit
|
|
218
|
+
.streamlit/secrets.toml
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
litescrape-0.1.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Nishizawa Takamasa
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,327 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: litescrape
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: 自分用・非汎用
|
|
5
|
+
Requires-Python: >=3.12
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Dist: patchright>=1.40
|
|
9
|
+
Requires-Dist: playwright>=1.40
|
|
10
|
+
Requires-Dist: selectolax>=0.3
|
|
11
|
+
Requires-Dist: pyarrow>=23.0
|
|
12
|
+
Requires-Dist: camoufox>=0.4
|
|
13
|
+
Requires-Dist: loguru>=0.7
|
|
14
|
+
Requires-Dist: tqdm>=4.66
|
|
15
|
+
|
|
16
|
+
# LiteScrape
|
|
17
|
+
|
|
18
|
+
自分用・非汎用
|
|
19
|
+
|
|
20
|
+
## インストール
|
|
21
|
+
`uv add litescrape`
|
|
22
|
+
|
|
23
|
+
※ `run_patchright` を使うとき:Google ChromeをPCにインストールしておく。
|
|
24
|
+
※ `run_camoufox` を使うとき:`uv run camoufox fetch`
|
|
25
|
+
|
|
26
|
+
## 実装機能
|
|
27
|
+
|
|
28
|
+
### litescrape
|
|
29
|
+
|
|
30
|
+
- `LitePage`
|
|
31
|
+
- `LiteElement`
|
|
32
|
+
- `LiteElementGroup`
|
|
33
|
+
- `ElementScan`
|
|
34
|
+
- `LiteFrame`
|
|
35
|
+
- `LiteShadowRoot`
|
|
36
|
+
- `LiteParser`
|
|
37
|
+
- `LiteNode`
|
|
38
|
+
- `LiteNodeGroup`
|
|
39
|
+
- `NodeScan`
|
|
40
|
+
|
|
41
|
+
### litescrape.utils
|
|
42
|
+
|
|
43
|
+
- `parse_html(path: Path) -> LexborHTMLParser | None`
|
|
44
|
+
- `meta_html(meta: Mapping[str, object | None]) -> str`
|
|
45
|
+
- `from_here(file: str) -> Callable[[str], Path]`
|
|
46
|
+
- `append_csv(path: Path, row: dict) -> None`
|
|
47
|
+
- `write_csv(path: Path, rows: list[dict]) -> None`
|
|
48
|
+
- `write_parquet(path: Path, rows: list[dict]) -> None`
|
|
49
|
+
- `hash_name(key: str) -> str`
|
|
50
|
+
- `write_text(path: Path, data: str) -> bool`
|
|
51
|
+
- `write_bytes(path: Path, data: bytes) -> bool`
|
|
52
|
+
- `save_log(path: Path, level: str = 'WARNING') -> None`
|
|
53
|
+
- `process_map[T, R](worker: Callable[[T], R], items: Iterable[T], workers: int | None = None, *, chunksize: int | None = None) -> list[R | None]`
|
|
54
|
+
- `glob_paths(dir_path: Path, pattern: str = '*.html') -> list[str]`
|
|
55
|
+
- `counter(start: int = 1) -> Iterator[int]`
|
|
56
|
+
|
|
57
|
+
### litescrape.browser
|
|
58
|
+
|
|
59
|
+
- `Span`
|
|
60
|
+
- `run_patchright(*, browser: dict | None = None, context: dict | None = None, span: Span | None = None) -> PatchrightRunner`
|
|
61
|
+
- `run_camoufox(*, browser: dict | None = None, context: dict | None = None, span: Span | None = None) -> CamoufoxRunner`
|
|
62
|
+
- `PatchrightRunner.page() -> Page` / `CamoufoxRunner.page() -> Page`
|
|
63
|
+
|
|
64
|
+
`browser` / `context` は Playwright へ渡す起動オプション。`span` は litescrape の再生成間隔(`page()` 呼び出し回数ごとに独立して効く。省略時は再生成しない)。`page()` を呼ぶたびに内部カウントが 1 進む。
|
|
65
|
+
|
|
66
|
+
## 使用例
|
|
67
|
+
|
|
68
|
+
### crawl.py
|
|
69
|
+
```python
|
|
70
|
+
from urllib.parse import urlencode
|
|
71
|
+
|
|
72
|
+
from litescrape import lite_page
|
|
73
|
+
from litescrape.browser import Span, run_patchright
|
|
74
|
+
from litescrape.utils import save_log, from_here, counter, write_csv
|
|
75
|
+
|
|
76
|
+
here = from_here(__file__)
|
|
77
|
+
save_log(here('log/crawling.log'))
|
|
78
|
+
|
|
79
|
+
with run_patchright(
|
|
80
|
+
browser={'channel': 'chrome', 'headless': False},
|
|
81
|
+
context={'viewport': {'width': 1920, 'height': 1080}},
|
|
82
|
+
span=Span(browser=300, context=100, page=20),
|
|
83
|
+
) as s:
|
|
84
|
+
page = s.page()
|
|
85
|
+
p = lite_page(page)
|
|
86
|
+
p.goto('https://home.katitas.jp/buyers_search')
|
|
87
|
+
prefecture_urls = p.ii('div ul li a[href^="https://home.katitas.jp/buyers_search/area"]').urls
|
|
88
|
+
|
|
89
|
+
n = len(prefecture_urls)
|
|
90
|
+
urls = []
|
|
91
|
+
for i, prefecture_url in enumerate(prefecture_urls):
|
|
92
|
+
print(f'prefecture_url {i}/{n - 1}')
|
|
93
|
+
for page_num in counter():
|
|
94
|
+
page = s.page()
|
|
95
|
+
p = lite_page(page)
|
|
96
|
+
if not p.goto(f'{prefecture_url}?{urlencode({"page": page_num})}', sleep_after=(0.5, 1)):
|
|
97
|
+
break
|
|
98
|
+
if not (bukken_elems := p.ii('ul li div a[href^="https://home.katitas.jp"]:has(p)')):
|
|
99
|
+
break
|
|
100
|
+
urls.extend(bukken_elems.urls)
|
|
101
|
+
write_csv(here('csv/urls.csv'), [{'url': url} for url in urls])
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### scrape.py
|
|
105
|
+
```python
|
|
106
|
+
from datetime import datetime, timezone
|
|
107
|
+
import time
|
|
108
|
+
|
|
109
|
+
import pandas as pd
|
|
110
|
+
|
|
111
|
+
from litescrape import lite_page
|
|
112
|
+
from litescrape.browser import Span, run_patchright
|
|
113
|
+
from litescrape.utils import (
|
|
114
|
+
save_log,
|
|
115
|
+
append_csv,
|
|
116
|
+
from_here,
|
|
117
|
+
meta_html,
|
|
118
|
+
hash_name,
|
|
119
|
+
write_text,
|
|
120
|
+
write_bytes,
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
here = from_here(__file__)
|
|
124
|
+
save_log(here('log/scraping.log'))
|
|
125
|
+
|
|
126
|
+
items = list(pd.read_csv(here('csv/urls.csv'))['url'].items())
|
|
127
|
+
n = len(items)
|
|
128
|
+
|
|
129
|
+
with run_patchright(
|
|
130
|
+
browser={'channel': 'chrome', 'headless': False},
|
|
131
|
+
context={'viewport': {'width': 1920, 'height': 1080}},
|
|
132
|
+
span=Span(browser=300, context=100),
|
|
133
|
+
) as s:
|
|
134
|
+
for url_index, request_url in items:
|
|
135
|
+
print(f'url_index {url_index}/{n - 1}')
|
|
136
|
+
page = s.page()
|
|
137
|
+
p = lite_page(page)
|
|
138
|
+
if not p.goto(request_url):
|
|
139
|
+
append_csv(here('csv/failed.csv'), {
|
|
140
|
+
'url_index': url_index,
|
|
141
|
+
'request_url': request_url,
|
|
142
|
+
'reason': 'goto',
|
|
143
|
+
})
|
|
144
|
+
continue
|
|
145
|
+
html = meta_html({
|
|
146
|
+
'litescrape:url_index': url_index,
|
|
147
|
+
'litescrape:saved_at': datetime.now(timezone.utc),
|
|
148
|
+
'litescrape:request_url': request_url,
|
|
149
|
+
'litescrape:final_url': page.url,
|
|
150
|
+
}) + page.content()
|
|
151
|
+
if not write_text(here('html') / f'{hash_name(page.url)}.html', html):
|
|
152
|
+
append_csv(here('csv/failed.csv'), {
|
|
153
|
+
'url_index': url_index,
|
|
154
|
+
'request_url': request_url,
|
|
155
|
+
'reason': 'write_text',
|
|
156
|
+
})
|
|
157
|
+
|
|
158
|
+
page.screenshot(path=here(f'media/{url_index}-full-page.png'), full_page=True)
|
|
159
|
+
|
|
160
|
+
elem_iframe = p.i('iframe[src^="https://home.katitas.jp"]')
|
|
161
|
+
elem_iframe.scroll_into_view()
|
|
162
|
+
time.sleep(3)
|
|
163
|
+
elem_iframe.screenshot(here(f'media/{url_index}-gmap.png'), isolate=True)
|
|
164
|
+
|
|
165
|
+
img_li_scan = p.ii('p.text-left').scan.m(r'画像をクリックすると拡大画像がご覧に').n('ul').ii('li').scan
|
|
166
|
+
img_li = img_li_scan.m(r'外観') or img_li_scan.m(r'^(?!.*間取).*')
|
|
167
|
+
img_url = img_li.i('a').url
|
|
168
|
+
if (body := p.bytes_at(img_url)):
|
|
169
|
+
write_bytes(here(f'media/{url_index}-img-desc.jpg'), body)
|
|
170
|
+
|
|
171
|
+
main_img_url = p.i('img.w-full.object-contain').src
|
|
172
|
+
if (body := p.bytes_at(main_img_url)):
|
|
173
|
+
write_bytes(here(f'media/{url_index}-img-main.jpg'), body)
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
### extract.py
|
|
177
|
+
```python
|
|
178
|
+
from pathlib import Path
|
|
179
|
+
|
|
180
|
+
from litescrape import lite_parser
|
|
181
|
+
from litescrape.utils import from_here, glob_paths, parse_html, process_map, write_parquet
|
|
182
|
+
|
|
183
|
+
def main():
|
|
184
|
+
here = from_here(__file__)
|
|
185
|
+
html_paths = glob_paths(here('html'), '*.html')
|
|
186
|
+
results = [r for r in process_map(extract, html_paths) if r]
|
|
187
|
+
write_parquet(here('parquet/extract.parquet'), results)
|
|
188
|
+
|
|
189
|
+
def extract(file_path: str) -> dict | None:
|
|
190
|
+
if not (parser := parse_html(Path(file_path))):
|
|
191
|
+
return None
|
|
192
|
+
p = lite_parser(parser)
|
|
193
|
+
dt_scan = p.ii('dt').scan
|
|
194
|
+
dd_text = lambda pattern: dt_scan.m(pattern).n('dd').text
|
|
195
|
+
return {
|
|
196
|
+
'url_index': p.i('meta[name="litescrape:url_index"]').attr('content'),
|
|
197
|
+
'saved_at': p.i('meta[name="litescrape:saved_at"]').attr('content'),
|
|
198
|
+
'request_url': p.i('meta[name="litescrape:request_url"]').attr('content'),
|
|
199
|
+
'final_url': p.i('meta[name="litescrape:final_url"]').attr('content'),
|
|
200
|
+
'ファイル名': Path(file_path).name,
|
|
201
|
+
|
|
202
|
+
'取り扱い店舗': p.ii('p').scan.m(r'取り扱い店舗').n('p').text,
|
|
203
|
+
|
|
204
|
+
'価格': dd_text(r'価格'),
|
|
205
|
+
'月々の支払い': dd_text(r'月々の支払い'),
|
|
206
|
+
'間取': dd_text(r'間取'),
|
|
207
|
+
'土地面積': dd_text(r'土地面積'),
|
|
208
|
+
'建物面積': dd_text(r'建物面積'),
|
|
209
|
+
|
|
210
|
+
'所在地': dd_text(r'所在地'),
|
|
211
|
+
'交通': dd_text(r'交通'),
|
|
212
|
+
'接道状況': dd_text(r'接道状況'),
|
|
213
|
+
'私道面積': dd_text(r'私道面積'),
|
|
214
|
+
'セットバック': dd_text(r'セットバック'),
|
|
215
|
+
'建物構造': dd_text(r'建物構造'),
|
|
216
|
+
'国土法提出': dd_text(r'国土法提出'),
|
|
217
|
+
'駐車場': dd_text(r'駐車場'),
|
|
218
|
+
'車庫区分': dd_text(r'車庫区分'),
|
|
219
|
+
'都市計画': dd_text(r'都市計画'),
|
|
220
|
+
'物件種別': dd_text(r'物件種別'),
|
|
221
|
+
'建ぺい率 /容積率': dd_text(r'建ぺい率.*容積率'),
|
|
222
|
+
'土地権利': dd_text(r'土地権利'),
|
|
223
|
+
'地目': dd_text(r'地目'),
|
|
224
|
+
'築年月': dd_text(r'築年月'),
|
|
225
|
+
'取引態様': dd_text(r'取引態様'),
|
|
226
|
+
'引渡日(入居予定日)': dd_text(r'引渡日.*入居予定日'),
|
|
227
|
+
'用途地域': dd_text(r'用途地域'),
|
|
228
|
+
'現況': dd_text(r'現況'),
|
|
229
|
+
'設備・条件': dd_text(r'設備.*条件'),
|
|
230
|
+
'備考': dd_text(r'備考'),
|
|
231
|
+
'最寄りの学校': dd_text(r'最寄.*の学校'),
|
|
232
|
+
'物件番号': dd_text(r'物件番号'),
|
|
233
|
+
'情報更新日': dd_text(r'情報更新日'),
|
|
234
|
+
'次回更新予定日': dd_text(r'次回更新予定日'),
|
|
235
|
+
|
|
236
|
+
'スタッフからのコメント': p.ii('div').scan.m(r'スタッフからのコメント').n('div').text,
|
|
237
|
+
'物件の魅力': p.ii('p').scan.m(r'物件の魅力').n('p').text,
|
|
238
|
+
|
|
239
|
+
'img_desc': '\n'.join(p.ii('p.text-left').scan.m(r'画像をクリックすると拡大画像がご覧に').n('ul').ii('li').texts)
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
if __name__ == '__main__':
|
|
243
|
+
main()
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
### clean.ipynb
|
|
247
|
+
```python
|
|
248
|
+
import re
|
|
249
|
+
|
|
250
|
+
import pandas as pd
|
|
251
|
+
```
|
|
252
|
+
```python
|
|
253
|
+
df_shikutyoson = pd.read_csv('./shikutyoson.csv')
|
|
254
|
+
cities = df_shikutyoson["市区町村"].dropna().sort_values(key=lambda x: x.str.len(), ascending=False)
|
|
255
|
+
shikutyoson_pattern = "|".join(cities.map(lambda x: re.escape(x)))
|
|
256
|
+
```
|
|
257
|
+
```python
|
|
258
|
+
df_raw = pd.read_parquet('parquet/extract.parquet')
|
|
259
|
+
df_raw = df_raw.apply(lambda x: x.fillna('').str.normalize('NFKC').str.strip())
|
|
260
|
+
```
|
|
261
|
+
```python
|
|
262
|
+
df = df_raw.sort_values('saved_at')[['url_index', 'saved_at', 'request_url', 'final_url']].copy()
|
|
263
|
+
|
|
264
|
+
df['事例種別'] = df_raw['物件種別'].str.contains(r'中古|土地').map({True: '中古売出'})
|
|
265
|
+
df['総額'] = (
|
|
266
|
+
df_raw['価格']
|
|
267
|
+
.str.extract(r'([,\d]+)\s*万円', expand=False)
|
|
268
|
+
.replace(',', '', regex=True)
|
|
269
|
+
.pipe(lambda s: pd.to_numeric(s, errors='coerce') * 10000)
|
|
270
|
+
)
|
|
271
|
+
df['土地面積'] = df_raw['土地面積'].str.extract(r'([\d\.]+)')
|
|
272
|
+
df['建物面積'] = df_raw['建物面積'].str.extract(r'([\d\.]+)')
|
|
273
|
+
df['建物種別'] = df_raw['物件種別'].map({'中古戸建': '戸建て', '中古マンション': 'マンション', '土地': '土地'})
|
|
274
|
+
df[['所在都道府県', '所在市', '所在字', '所在番地']] = df_raw['所在地'].str.extract(fr'^(京都府|.+?[都道府県])({shikutyoson_pattern})(\D*)(.*)')
|
|
275
|
+
|
|
276
|
+
s1 = (
|
|
277
|
+
df_raw['築年月']
|
|
278
|
+
.replace({r'元年': r'1年'}, regex=True)
|
|
279
|
+
.str.extract(r'(\d+)年', expand=False)
|
|
280
|
+
.pipe(lambda s: pd.to_numeric(s, errors='coerce'))
|
|
281
|
+
)
|
|
282
|
+
s2 = df_raw['築年月'].str[:2].map({'令和': 2018, '平成': 1988, '昭和': 1925, '大正': 1911, '明治': 1867})
|
|
283
|
+
df['建築年'] = s1 + s2
|
|
284
|
+
|
|
285
|
+
df['構造体'] = df_raw['建物構造'].str.extract(r'^(\S+)')
|
|
286
|
+
df['階層'] = df_raw['建物構造'].str.extract(r'(\d+)階')
|
|
287
|
+
df['リノベ内容'] = df_raw['備考'].str.extract(r'(?s)^(20\d{2}/.*?)\n\D')
|
|
288
|
+
df['間取'] = df_raw['間取']
|
|
289
|
+
df['成約年月'] = df_raw['現況'].map({'空': '販売中', '古家付': '販売中'})
|
|
290
|
+
df['私道負担'] = df_raw['私道面積']
|
|
291
|
+
df['接道'] = df_raw['接道状況']
|
|
292
|
+
|
|
293
|
+
s1 = df_raw['最寄りの学校'].str.extract(r'([^/\s【】・、(]+?小学校)', expand=False)
|
|
294
|
+
s2 = df_raw['物件の魅力'].str.extract(r'([^/\s【】・、(]+?小学校)', expand=False)
|
|
295
|
+
s3 = df_raw['備考'].str.extract(r'([^/\s【】・、(]+?小学校)', expand=False)
|
|
296
|
+
s4 = df_raw['img_desc'].str.extract(r'([^/\s【】・、(]+?小学校)', expand=False)
|
|
297
|
+
df['小学校'] = s1.fillna(s2).fillna(s3).fillna(s4)
|
|
298
|
+
|
|
299
|
+
s1 = df_raw['最寄りの学校'].str.extract(r'([^/\s【】・、(]+?中学校)', expand=False)
|
|
300
|
+
s2 = df_raw['物件の魅力'].str.extract(r'([^/\s【】・、(]+?中学校)', expand=False)
|
|
301
|
+
s3 = df_raw['備考'].str.extract(r'([^/\s【】・、(]+?中学校)', expand=False)
|
|
302
|
+
s4 = df_raw['img_desc'].str.extract(r'([^/\s【】・、(]+?中学校)', expand=False)
|
|
303
|
+
df['中学校'] = s1.fillna(s2).fillna(s3).fillna(s4)
|
|
304
|
+
|
|
305
|
+
df['周辺環境'] = df_raw['備考'].map(lambda x: '\n'.join(l for l in x.splitlines() if re.search(r'(?:\d分|\dm)$', l)))
|
|
306
|
+
df['都市計画'] = df_raw['都市計画']
|
|
307
|
+
df['用途地域'] = df_raw['用途地域']
|
|
308
|
+
df[['建ぺい率', '容積率']] = df_raw['建ぺい率 /容積率'].str.extract(r'(\d+%)\D*(\d+%)')
|
|
309
|
+
df['水道'] = df_raw['設備・条件'].str.extract(r'(公営水道|上水道)')
|
|
310
|
+
df['下水'] = df_raw['設備・条件'].str.extract(r'(本下水|個別浄化槽|汲取|下水道)')
|
|
311
|
+
df['ガス'] = df_raw['設備・条件'].str.extract(r'(個別LPG|集中LPG|都市ガス|プロパンガス|オール電化)')
|
|
312
|
+
df['契約態様'] = df_raw['取引態様']
|
|
313
|
+
df['問合せ先'] = df_raw['取り扱い店舗']
|
|
314
|
+
df['駐車場'] = df_raw['駐車場']
|
|
315
|
+
df['交通'] = df_raw['交通']
|
|
316
|
+
df['物件の特徴'] = df_raw['物件の魅力']
|
|
317
|
+
df['仕様'] = df_raw['設備・条件']
|
|
318
|
+
|
|
319
|
+
df['土地権利'] = df_raw['土地権利']
|
|
320
|
+
df['地目'] = df_raw['地目']
|
|
321
|
+
df['引渡日(入居予定日)'] = df_raw['引渡日(入居予定日)']
|
|
322
|
+
df['物件番号'] = df_raw['物件番号']
|
|
323
|
+
df['情報更新日'] = df_raw['情報更新日']
|
|
324
|
+
```
|
|
325
|
+
```python
|
|
326
|
+
df.to_clipboard(index=False)
|
|
327
|
+
```
|