quickquery 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,338 @@
1
+ Metadata-Version: 2.4
2
+ Name: quickquery
3
+ Version: 0.1.1
4
+ Summary: 自分用・非汎用
5
+ Requires-Python: >=3.12
6
+ Description-Content-Type: text/markdown
7
+ License-File: LICENSE
8
+ Requires-Dist: patchright>=1.40
9
+ Requires-Dist: playwright>=1.40
10
+ Requires-Dist: selectolax>=0.3
11
+ Requires-Dist: pyarrow>=23.0
12
+ Requires-Dist: camoufox>=0.4
13
+ Requires-Dist: loguru>=0.7
14
+ Requires-Dist: tqdm>=4.66
15
+
16
+ # QuickQuery
17
+
18
+ 自分用・非汎用
19
+
20
+ ## インストール
21
+ `uv add quickquery`
22
+
23
+ ※ `open_patchright` を使うとき:Google Chrome を PC にインストールしておく。
24
+ ※ `open_camoufox` を使うとき:事前に `uv run camoufox fetch` を実行しておく。
25
+
26
+ ## 実装機能
27
+
28
+ ### quickquery
29
+
30
+ - `quick_page(page: Page) -> QuickPage`
31
+ - `quick_element(page: Page, elem: ElementHandle | None) -> QuickElement`
32
+ - `quick_element_group(page: Page, elems: list[QuickElement]) -> QuickElementGroup`
33
+ - `quick_frame(page: Page, frame: Frame | None) -> QuickFrame`
34
+ - `quick_shadow_root(page: Page, host: ElementHandle | None) -> QuickShadowRoot`
35
+ - `quick_parser(parser: LexborHTMLParser) -> QuickParser`
36
+ - `quick_node(node: LexborNode | None) -> QuickNode`
37
+ - `quick_node_group(nodes: list[QuickNode]) -> QuickNodeGroup`
38
+ - `QuickPage`
39
+ - `QuickElement`
40
+ - `QuickElementGroup`
41
+ - `ElementScan`
42
+ - `QuickFrame`
43
+ - `QuickShadowRoot`
44
+ - `QuickParser`
45
+ - `QuickNode`
46
+ - `QuickNodeGroup`
47
+ - `NodeScan`
48
+
49
+ ### quickquery.utils
50
+
51
+ - `parse_html(path: Path) -> LexborHTMLParser | None`
52
+ - `meta_html(meta: Mapping[str, object | None]) -> str`
53
+ - `from_here(file: str) -> Callable[[str], Path]`
54
+ - `append_csv(path: Path, row: dict) -> None`
55
+ - `write_csv(path: Path, rows: list[dict]) -> None`
56
+ - `write_parquet(path: Path, rows: list[dict]) -> None`
57
+ - `hash_name(key: str) -> str`
58
+ - `write_text(path: Path, data: str) -> bool`
59
+ - `write_bytes(path: Path, data: bytes) -> bool`
60
+ - `save_log(path: Path, level: str = 'WARNING') -> None`
61
+ - `process_map[T, R](worker: Callable[[T], R], items: Iterable[T], workers: int | None = None, *, chunksize: int | None = None) -> list[R | None]`
62
+ - `glob_paths(dir_path: Path, pattern: str = '*.html') -> list[str]`
63
+ - `counter(start: int = 1) -> Iterator[int]`
64
+
65
+ ### quickquery.live
66
+
67
+ - `RecycleEvery`
68
+ - `PatchrightSession`
69
+ - `CamoufoxSession`
70
+ - `open_patchright(*, browser_options: dict | None = None, context_options: dict | None = None, recycle: RecycleEvery | None = None) -> PatchrightSession`
71
+ - `open_camoufox(*, browser_options: dict | None = None, context_options: dict | None = None, recycle: RecycleEvery | None = None) -> CamoufoxSession`
72
+ - `PatchrightSession.page() -> Page`
73
+ - `CamoufoxSession.page() -> Page`
74
+
75
+ `browser_options` / `context_options` は Playwright へ渡す起動オプション。`recycle` は quickquery が browser / context / page を再生成する間隔で、いずれも `page()` の呼び出し回数で指定する(各間隔はそれぞれ独立して効く。省略時は再生成しない)。`page()` を呼ぶたびに内部カウントが 1 進む。
76
+
77
+ ## 使用例
78
+
79
+ ### crawl.py
80
+ ```python
81
+ from urllib.parse import urlencode
82
+
83
+ from quickquery import quick_page
84
+ from quickquery.live import RecycleEvery, open_patchright
85
+ from quickquery.utils import save_log, from_here, counter, write_csv
86
+
87
+ here = from_here(__file__)
88
+ save_log(here('log/crawling.log'))
89
+
90
+ with open_patchright(
91
+ browser_options={'channel': 'chrome', 'headless': False},
92
+ context_options={'viewport': {'width': 1920, 'height': 1080}},
93
+ recycle=RecycleEvery(browser=300, context=100, page=20),
94
+ ) as s:
95
+ page = s.page()
96
+ p = quick_page(page)
97
+ p.goto('https://home.katitas.jp/buyers_search')
98
+ prefecture_urls = p.ii('div ul li a[href^="https://home.katitas.jp/buyers_search/area"]').urls
99
+
100
+ n = len(prefecture_urls)
101
+ urls = []
102
+ for i, prefecture_url in enumerate(prefecture_urls):
103
+ print(f'prefecture_url {i}/{n - 1}')
104
+ for page_num in counter():
105
+ page = s.page()
106
+ p = quick_page(page)
107
+ if not p.goto(f'{prefecture_url}?{urlencode({"page": page_num})}', sleep_after=(0.5, 1)):
108
+ break
109
+ if not (bukken_elems := p.ii('ul li div a[href^="https://home.katitas.jp"]:has(p)')):
110
+ break
111
+ urls.extend(bukken_elems.urls)
112
+ write_csv(here('csv/urls.csv'), [{'url': url} for url in urls])
113
+ ```
114
+
115
+ ### scrape.py
116
+ ```python
117
+ from datetime import datetime, timezone
118
+ import time
119
+
120
+ import pandas as pd
121
+
122
+ from quickquery import quick_page
123
+ from quickquery.live import RecycleEvery, open_patchright
124
+ from quickquery.utils import (
125
+ save_log,
126
+ append_csv,
127
+ from_here,
128
+ meta_html,
129
+ hash_name,
130
+ write_text,
131
+ write_bytes,
132
+ )
133
+
134
+ here = from_here(__file__)
135
+ save_log(here('log/scraping.log'))
136
+
137
+ items = list(pd.read_csv(here('csv/urls.csv'))['url'].items())
138
+ n = len(items)
139
+
140
+ with open_patchright(
141
+ browser_options={'channel': 'chrome', 'headless': False},
142
+ context_options={'viewport': {'width': 1920, 'height': 1080}},
143
+ recycle=RecycleEvery(browser=300, context=100),
144
+ ) as s:
145
+ for url_index, request_url in items:
146
+ print(f'url_index {url_index}/{n - 1}')
147
+ page = s.page()
148
+ p = quick_page(page)
149
+ if not p.goto(request_url):
150
+ append_csv(here('csv/failed.csv'), {
151
+ 'url_index': url_index,
152
+ 'request_url': request_url,
153
+ 'reason': 'goto',
154
+ })
155
+ continue
156
+ html = meta_html({
157
+ 'quickquery:url_index': url_index,
158
+ 'quickquery:saved_at': datetime.now(timezone.utc),
159
+ 'quickquery:request_url': request_url,
160
+ 'quickquery:final_url': page.url,
161
+ }) + page.content()
162
+ if not write_text(here('html') / f'{hash_name(page.url)}.html', html):
163
+ append_csv(here('csv/failed.csv'), {
164
+ 'url_index': url_index,
165
+ 'request_url': request_url,
166
+ 'reason': 'write_text',
167
+ })
168
+
169
+ page.screenshot(path=here(f'media/{url_index}-full-page.png'), full_page=True)
170
+
171
+ elem_iframe = p.i('iframe[src^="https://home.katitas.jp"]')
172
+ elem_iframe.scroll_into_view()
173
+ time.sleep(3)
174
+ elem_iframe.screenshot(here(f'media/{url_index}-gmap.png'), isolate=True)
175
+
176
+ img_li_scan = p.ii('p.text-left').scan.m(r'画像をクリックすると拡大画像がご覧に').n('ul').ii('li').scan
177
+ img_li = img_li_scan.m(r'外観') or img_li_scan.m(r'^(?!.*間取).*')
178
+ img_url = img_li.i('a').url
179
+ if (body := p.bytes_at(img_url)):
180
+ write_bytes(here(f'media/{url_index}-img-desc.jpg'), body)
181
+
182
+ main_img_url = p.i('img.w-full.object-contain').src
183
+ if (body := p.bytes_at(main_img_url)):
184
+ write_bytes(here(f'media/{url_index}-img-main.jpg'), body)
185
+ ```
186
+
187
+ ### extract.py
188
+ ```python
189
+ from pathlib import Path
190
+
191
+ from quickquery import quick_parser
192
+ from quickquery.utils import from_here, glob_paths, parse_html, process_map, write_parquet
193
+
194
+ def main():
195
+ here = from_here(__file__)
196
+ html_paths = glob_paths(here('html'), '*.html')
197
+ results = [r for r in process_map(extract, html_paths) if r]
198
+ write_parquet(here('parquet/extract.parquet'), results)
199
+
200
+ def extract(file_path: str) -> dict | None:
201
+ if not (parser := parse_html(Path(file_path))):
202
+ return None
203
+ p = quick_parser(parser)
204
+ dt_scan = p.ii('dt').scan
205
+ dd_text = lambda pattern: dt_scan.m(pattern).n('dd').text
206
+ return {
207
+ 'url_index': p.i('meta[name="quickquery:url_index"]').attr('content'),
208
+ 'saved_at': p.i('meta[name="quickquery:saved_at"]').attr('content'),
209
+ 'request_url': p.i('meta[name="quickquery:request_url"]').attr('content'),
210
+ 'final_url': p.i('meta[name="quickquery:final_url"]').attr('content'),
211
+ 'ファイル名': Path(file_path).name,
212
+
213
+ '取り扱い店舗': p.ii('p').scan.m(r'取り扱い店舗').n('p').text,
214
+
215
+ '価格': dd_text(r'価格'),
216
+ '月々の支払い': dd_text(r'月々の支払い'),
217
+ '間取': dd_text(r'間取'),
218
+ '土地面積': dd_text(r'土地面積'),
219
+ '建物面積': dd_text(r'建物面積'),
220
+
221
+ '所在地': dd_text(r'所在地'),
222
+ '交通': dd_text(r'交通'),
223
+ '接道状況': dd_text(r'接道状況'),
224
+ '私道面積': dd_text(r'私道面積'),
225
+ 'セットバック': dd_text(r'セットバック'),
226
+ '建物構造': dd_text(r'建物構造'),
227
+ '国土法提出': dd_text(r'国土法提出'),
228
+ '駐車場': dd_text(r'駐車場'),
229
+ '車庫区分': dd_text(r'車庫区分'),
230
+ '都市計画': dd_text(r'都市計画'),
231
+ '物件種別': dd_text(r'物件種別'),
232
+ '建ぺい率 /容積率': dd_text(r'建ぺい率.*容積率'),
233
+ '土地権利': dd_text(r'土地権利'),
234
+ '地目': dd_text(r'地目'),
235
+ '築年月': dd_text(r'築年月'),
236
+ '取引態様': dd_text(r'取引態様'),
237
+ '引渡日(入居予定日)': dd_text(r'引渡日.*入居予定日'),
238
+ '用途地域': dd_text(r'用途地域'),
239
+ '現況': dd_text(r'現況'),
240
+ '設備・条件': dd_text(r'設備.*条件'),
241
+ '備考': dd_text(r'備考'),
242
+ '最寄りの学校': dd_text(r'最寄.*の学校'),
243
+ '物件番号': dd_text(r'物件番号'),
244
+ '情報更新日': dd_text(r'情報更新日'),
245
+ '次回更新予定日': dd_text(r'次回更新予定日'),
246
+
247
+ 'スタッフからのコメント': p.ii('div').scan.m(r'スタッフからのコメント').n('div').text,
248
+ '物件の魅力': p.ii('p').scan.m(r'物件の魅力').n('p').text,
249
+
250
+ 'img_desc': '\n'.join(p.ii('p.text-left').scan.m(r'画像をクリックすると拡大画像がご覧に').n('ul').ii('li').texts)
251
+ }
252
+
253
+ if __name__ == '__main__':
254
+ main()
255
+ ```
256
+
257
+ ### clean.ipynb
258
+ ```python
259
+ import re
260
+
261
+ import pandas as pd
262
+ ```
263
+ ```python
264
+ df_shikutyoson = pd.read_csv('./shikutyoson.csv')
265
+ cities = df_shikutyoson["市区町村"].dropna().sort_values(key=lambda x: x.str.len(), ascending=False)
266
+ shikutyoson_pattern = "|".join(cities.map(lambda x: re.escape(x)))
267
+ ```
268
+ ```python
269
+ df_raw = pd.read_parquet('parquet/extract.parquet')
270
+ df_raw = df_raw.apply(lambda x: x.fillna('').str.normalize('NFKC').str.strip())
271
+ ```
272
+ ```python
273
+ df = df_raw.sort_values('saved_at')[['url_index', 'saved_at', 'request_url', 'final_url']].copy()
274
+
275
+ df['事例種別'] = df_raw['物件種別'].str.contains(r'中古|土地').map({True: '中古売出'})
276
+ df['総額'] = (
277
+ df_raw['価格']
278
+ .str.extract(r'([,\d]+)\s*万円', expand=False)
279
+ .replace(',', '', regex=True)
280
+ .pipe(lambda s: pd.to_numeric(s, errors='coerce') * 10000)
281
+ )
282
+ df['土地面積'] = df_raw['土地面積'].str.extract(r'([\d\.]+)')
283
+ df['建物面積'] = df_raw['建物面積'].str.extract(r'([\d\.]+)')
284
+ df['建物種別'] = df_raw['物件種別'].map({'中古戸建': '戸建て', '中古マンション': 'マンション', '土地': '土地'})
285
+ df[['所在都道府県', '所在市', '所在字', '所在番地']] = df_raw['所在地'].str.extract(fr'^(京都府|.+?[都道府県])({shikutyoson_pattern})(\D*)(.*)')
286
+
287
+ s1 = (
288
+ df_raw['築年月']
289
+ .replace({r'元年': r'1年'}, regex=True)
290
+ .str.extract(r'(\d+)年', expand=False)
291
+ .pipe(lambda s: pd.to_numeric(s, errors='coerce'))
292
+ )
293
+ s2 = df_raw['築年月'].str[:2].map({'令和': 2018, '平成': 1988, '昭和': 1925, '大正': 1911, '明治': 1867})
294
+ df['建築年'] = s1 + s2
295
+
296
+ df['構造体'] = df_raw['建物構造'].str.extract(r'^(\S+)')
297
+ df['階層'] = df_raw['建物構造'].str.extract(r'(\d+)階')
298
+ df['リノベ内容'] = df_raw['備考'].str.extract(r'(?s)^(20\d{2}/.*?)\n\D')
299
+ df['間取'] = df_raw['間取']
300
+ df['成約年月'] = df_raw['現況'].map({'空': '販売中', '古家付': '販売中'})
301
+ df['私道負担'] = df_raw['私道面積']
302
+ df['接道'] = df_raw['接道状況']
303
+
304
+ s1 = df_raw['最寄りの学校'].str.extract(r'([^/\s【】・、(]+?小学校)', expand=False)
305
+ s2 = df_raw['物件の魅力'].str.extract(r'([^/\s【】・、(]+?小学校)', expand=False)
306
+ s3 = df_raw['備考'].str.extract(r'([^/\s【】・、(]+?小学校)', expand=False)
307
+ s4 = df_raw['img_desc'].str.extract(r'([^/\s【】・、(]+?小学校)', expand=False)
308
+ df['小学校'] = s1.fillna(s2).fillna(s3).fillna(s4)
309
+
310
+ s1 = df_raw['最寄りの学校'].str.extract(r'([^/\s【】・、(]+?中学校)', expand=False)
311
+ s2 = df_raw['物件の魅力'].str.extract(r'([^/\s【】・、(]+?中学校)', expand=False)
312
+ s3 = df_raw['備考'].str.extract(r'([^/\s【】・、(]+?中学校)', expand=False)
313
+ s4 = df_raw['img_desc'].str.extract(r'([^/\s【】・、(]+?中学校)', expand=False)
314
+ df['中学校'] = s1.fillna(s2).fillna(s3).fillna(s4)
315
+
316
+ df['周辺環境'] = df_raw['備考'].map(lambda x: '\n'.join(l for l in x.splitlines() if re.search(r'(?:\d分|\dm)$', l)))
317
+ df['都市計画'] = df_raw['都市計画']
318
+ df['用途地域'] = df_raw['用途地域']
319
+ df[['建ぺい率', '容積率']] = df_raw['建ぺい率 /容積率'].str.extract(r'(\d+%)\D*(\d+%)')
320
+ df['水道'] = df_raw['設備・条件'].str.extract(r'(公営水道|上水道)')
321
+ df['下水'] = df_raw['設備・条件'].str.extract(r'(本下水|個別浄化槽|汲取|下水道)')
322
+ df['ガス'] = df_raw['設備・条件'].str.extract(r'(個別LPG|集中LPG|都市ガス|プロパンガス|オール電化)')
323
+ df['契約態様'] = df_raw['取引態様']
324
+ df['問合せ先'] = df_raw['取り扱い店舗']
325
+ df['駐車場'] = df_raw['駐車場']
326
+ df['交通'] = df_raw['交通']
327
+ df['物件の特徴'] = df_raw['物件の魅力']
328
+ df['仕様'] = df_raw['設備・条件']
329
+
330
+ df['土地権利'] = df_raw['土地権利']
331
+ df['地目'] = df_raw['地目']
332
+ df['引渡日(入居予定日)'] = df_raw['引渡日(入居予定日)']
333
+ df['物件番号'] = df_raw['物件番号']
334
+ df['情報更新日'] = df_raw['情報更新日']
335
+ ```
336
+ ```python
337
+ df.to_clipboard(index=False)
338
+ ```
@@ -0,0 +1,8 @@
1
+ quickquery/__init__.py,sha256=P8EEOHs2bi-x_TrHPvJ-J0eK-VvmXQAzoM2wOztWNQw,902
2
+ quickquery/core.py,sha256=C9v-k2eEMUbsxwPTHg64EpeOtAipa2sZQwYtneSfJ7M,23848
3
+ quickquery/live.py,sha256=ucua5y6JGYEDk9e1G9sOVVgTDujGyLzvcJ8zZEfvbfk,5868
4
+ quickquery/utils.py,sha256=xnLyvM20hyBu54guABnEEkbAQnmqA72SOGTc7kOBpV0,7984
5
+ quickquery-0.1.1.dist-info/licenses/LICENSE,sha256=q8ED812OTMMCwQSdHvtx6PSnmtRIotcIjKPHMmVseQI,1096
6
+ quickquery-0.1.1.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
7
+ quickquery-0.1.1.dist-info/METADATA,sha256=8rfPA14qXVFvftEJFdApgCvc5T9_l7Y9m2L2s11eBzs,13726
8
+ quickquery-0.1.1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: flit 3.12.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Nishizawa Takamasa
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.