eksi-scraper 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eksi_scraper-0.1.0/.gitignore +10 -0
- eksi_scraper-0.1.0/CLAUDE.md +38 -0
- eksi_scraper-0.1.0/LICENSE +21 -0
- eksi_scraper-0.1.0/PKG-INFO +59 -0
- eksi_scraper-0.1.0/README.md +44 -0
- eksi_scraper-0.1.0/eksisozluk_scraper/__init__.py +0 -0
- eksi_scraper-0.1.0/eksisozluk_scraper/__main__.py +3 -0
- eksi_scraper-0.1.0/eksisozluk_scraper/cli.py +123 -0
- eksi_scraper-0.1.0/eksisozluk_scraper/data_writer.py +55 -0
- eksi_scraper-0.1.0/eksisozluk_scraper/scraper.py +139 -0
- eksi_scraper-0.1.0/pyproject.toml +29 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Setup
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip3 install -r requirements.txt
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Running the scraper
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
# Run from the src/ directory
|
|
15
|
+
cd src
|
|
16
|
+
|
|
17
|
+
# Scrape specific threads
|
|
18
|
+
python3 main.py -t murat-kurum--2582131 ekrem-imamoglu--2577439 -o json
|
|
19
|
+
|
|
20
|
+
# Scrape from a file (one thread slug per line)
|
|
21
|
+
python3 main.py -f threads.txt -o csv
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Thread slugs are the URL path segment after `eksisozluk.com/` and before any `?` (e.g., `murat-kurum--2582131`).
|
|
25
|
+
|
|
26
|
+
Output files are written to the working directory where the script is run. Logs go to `eksisozluk_scraper.log`.
|
|
27
|
+
|
|
28
|
+
## Architecture
|
|
29
|
+
|
|
30
|
+
Three modules in `src/`:
|
|
31
|
+
|
|
32
|
+
- **`main.py`** — Entry point. Parses CLI args, creates one async task per thread, runs them concurrently via `asyncio.gather`. Uses `curl_cffi` with Chrome browser impersonation (`impersonate="chrome124"`) to bypass bot detection.
|
|
33
|
+
|
|
34
|
+
- **`eksisozluk_scraper.py`** — `EksiSozlukScraper` class. First fetches the thread's first page to read `data-pagecount` from the `.pager` div, then concurrently scrapes all pages using an `asyncio.Semaphore` (default: 15 concurrent requests). BeautifulSoup parsing is offloaded to a `ThreadPoolExecutor` to avoid blocking the event loop. Page requests retry with exponential backoff (up to 8 tries / 300s) via the `backoff` library.
|
|
35
|
+
|
|
36
|
+
- **`data_writer.py`** — `DataWriter` static class. Writes scraped entries (Content, Author, Date Created, Last Changed) to CSV or JSON using `aiofiles` for async I/O.
|
|
37
|
+
|
|
38
|
+
Each scraped entry dict has keys: `Content`, `Author`, `Date Created`, `Last Changed`.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Ibrahim Berkay Ceylan
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: eksi-scraper
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: asynchronously scrapes eksisozluk threads and exports to csv or json
|
|
5
|
+
Project-URL: Homepage, https://github.com/iberkayC/eksi-scraper
|
|
6
|
+
Author-email: Ibrahim Berkay Ceylan <ceylaniberkay@gmail.com>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Requires-Dist: backoff>=2.2
|
|
11
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
12
|
+
Requires-Dist: curl-cffi>=0.7
|
|
13
|
+
Requires-Dist: lxml>=5.1
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
# eksi-scraper
|
|
17
|
+
|
|
18
|
+
asynchronously scrapes eksisozluk threads in python, and puts entries in a csv or json file, named after the thread. intended for educational purposes only.
|
|
19
|
+
|
|
20
|
+
## installation
|
|
21
|
+
|
|
22
|
+
```
|
|
23
|
+
uv pip install eksi-scraper
|
|
24
|
+
```
|
|
25
|
+
or with pip:
|
|
26
|
+
```
|
|
27
|
+
pip install eksi-scraper
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## usage
|
|
31
|
+
|
|
32
|
+
```
|
|
33
|
+
eksi-scraper -t [thread1] [thread2] ... -f [inputFile.txt] -o (csv or json)
|
|
34
|
+
```
|
|
35
|
+
you can pass full URLs or just the slug (the part of the url after '/' and before '?'). for example:
|
|
36
|
+
|
|
37
|
+
```
|
|
38
|
+
eksi-scraper -t https://eksisozluk.com/murat-kurum--2582131 https://eksisozluk.com/ekrem-imamoglu--2577439 -o json
|
|
39
|
+
```
|
|
40
|
+
or using slugs:
|
|
41
|
+
```
|
|
42
|
+
eksi-scraper -t murat-kurum--2582131 ekrem-imamoglu--2577439 -o json
|
|
43
|
+
```
|
|
44
|
+
or from a file:
|
|
45
|
+
```
|
|
46
|
+
eksi-scraper -f threads.txt -o csv
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
where in threads.txt, threads are listed as URLs or slugs, one per line:
|
|
50
|
+
|
|
51
|
+
```
|
|
52
|
+
https://eksisozluk.com/murat-kurum--2582131
|
|
53
|
+
ekrem-imamoglu--2577439
|
|
54
|
+
...
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## contact
|
|
58
|
+
|
|
59
|
+
reach out to me at ceylaniberkay@gmail.com
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# eksi-scraper
|
|
2
|
+
|
|
3
|
+
asynchronously scrapes eksisozluk threads in python, and puts entries in a csv or json file, named after the thread. intended for educational purposes only.
|
|
4
|
+
|
|
5
|
+
## installation
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
uv pip install eksi-scraper
|
|
9
|
+
```
|
|
10
|
+
or with pip:
|
|
11
|
+
```
|
|
12
|
+
pip install eksi-scraper
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## usage
|
|
16
|
+
|
|
17
|
+
```
|
|
18
|
+
eksi-scraper -t [thread1] [thread2] ... -f [inputFile.txt] -o (csv or json)
|
|
19
|
+
```
|
|
20
|
+
you can pass full URLs or just the slug (the part of the url after '/' and before '?'). for example:
|
|
21
|
+
|
|
22
|
+
```
|
|
23
|
+
eksi-scraper -t https://eksisozluk.com/murat-kurum--2582131 https://eksisozluk.com/ekrem-imamoglu--2577439 -o json
|
|
24
|
+
```
|
|
25
|
+
or using slugs:
|
|
26
|
+
```
|
|
27
|
+
eksi-scraper -t murat-kurum--2582131 ekrem-imamoglu--2577439 -o json
|
|
28
|
+
```
|
|
29
|
+
or from a file:
|
|
30
|
+
```
|
|
31
|
+
eksi-scraper -f threads.txt -o csv
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
where in threads.txt, threads are listed as URLs or slugs, one per line:
|
|
35
|
+
|
|
36
|
+
```
|
|
37
|
+
https://eksisozluk.com/murat-kurum--2582131
|
|
38
|
+
ekrem-imamoglu--2577439
|
|
39
|
+
...
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## contact
|
|
43
|
+
|
|
44
|
+
reach out to me at ceylaniberkay@gmail.com
|
|
File without changes
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""Scrape threads from eksisozluk
|
|
2
|
+
|
|
3
|
+
Asynchronously scrapes threads from eksisozluk,
|
|
4
|
+
taking threads as command line arguments and
|
|
5
|
+
writes them to csv files. Some variables are too low
|
|
6
|
+
for any real scraping, but it's good for educational
|
|
7
|
+
purposes.
|
|
8
|
+
"""
|
|
9
|
+
from typing import List, Literal
|
|
10
|
+
import argparse
|
|
11
|
+
import asyncio
|
|
12
|
+
import sys
|
|
13
|
+
import logging
|
|
14
|
+
from urllib.parse import urlparse
|
|
15
|
+
from curl_cffi import requests
|
|
16
|
+
from .scraper import EksiSozlukScraper
|
|
17
|
+
from .data_writer import DataWriter
|
|
18
|
+
|
|
19
|
+
# Base URL that thread slugs are appended to when building request URLs.
BASE_URL = 'https://www.eksisozluk.com/'
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def extract_slug(value: str) -> str:
    """Normalize a thread reference to its bare slug.

    Accepts either a full eksisozluk URL or an already-bare slug and
    returns the slug (the URL path with surrounding slashes removed;
    any query string is discarded).

    Examples:
        'https://eksisozluk.com/murat-kurum--2582131?p=2' -> 'murat-kurum--2582131'
        'murat-kurum--2582131' -> 'murat-kurum--2582131'
    """
    parts = urlparse(value)
    is_full_url = parts.scheme in ('http', 'https')
    return parts.path.strip('/') if is_full_url else value
|
|
33
|
+
|
|
34
|
+
# Configure file-based logging for the whole CLI; scrape progress and
# errors go to eksisozluk_scraper.log in the working directory.
logging.basicConfig(filename='eksisozluk_scraper.log', level=logging.INFO,
                    format='%(asctime)s - %(message)s')
|
|
36
|
+
|
|
37
|
+
async def process_thread(scraper: EksiSozlukScraper,
                         session: requests.AsyncSession,
                         thread: str,
                         output_format: Literal['csv', 'json']) -> None:
    """
    Process a thread: scrape it and write the result to a file.

    The output file is named `<thread>.<output_format>` in the working
    directory. Errors are logged rather than raised so one failing thread
    does not cancel the other concurrently running thread tasks.

    Args:
        scraper (EksiSozlukScraper): scraper object
        session (requests.AsyncSession): session to make requests
        thread (str): thread slug to scrape
        output_format (csv or json): output format (also the file extension)
    """
    try:
        logging.info('Started scraping thread %s', thread)
        scraped_data = await scraper.scrape_thread(session, thread)

        if scraped_data:
            filename = f"{thread}.{output_format}"
            DataWriter.write_data(filename, scraped_data, output_format)
            # Bug fix: the success message previously logged a literal
            # "(unknown)" instead of the actual output filename.
            logging.info('Successfully scraped and saved thread %s to %s',
                         thread, filename)
        else:
            logging.warning('No data scraped for thread: %s', thread)

    except Exception as e:
        # Last-resort guard so a single thread failure is recorded but does
        # not propagate into asyncio.gather and abort sibling tasks.
        logging.error('Unexpected error in process_thread: %s', e)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
async def main(threads: List[str], output_format: Literal['csv', 'json'] = 'csv') -> None:
    """
    Scrape the given threads concurrently and write each to its own file.

    Args:
        threads (list): thread slugs to scrape (URL path segment after the /,
            before any ?).
        output_format (csv or json): output format for the files. Defaults to csv.
    """
    scraper = EksiSozlukScraper(BASE_URL)
    # Bug fix: the User-Agent was previously split with a backslash line
    # continuation inside the string literal, which embedded the next
    # line's leading whitespace in the header value. Use implicit string
    # concatenation to keep the value clean.
    header = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/58.0.3029.110 Safari/537.3'),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    }

    # impersonate="chrome124" makes curl_cffi present a Chrome TLS
    # fingerprint, which the site's bot detection expects.
    async with requests.AsyncSession(headers=header, impersonate="chrome124") as session:
        tasks = [process_thread(scraper, session, thread, output_format) for thread in threads]
        await asyncio.gather(*tasks)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def cli():
    """Parse command-line arguments, collect thread slugs, and run the scraper.

    Threads can come from `-t` (URLs or slugs) and/or `-f` (a file with one
    thread per line). Exits with status 1 on a missing/unreadable input file
    or when no threads are provided.
    """
    parser = argparse.ArgumentParser(
        description='Scrape threads from eksisozluk.com')
    parser.add_argument('-t', '--threads',
                        metavar='thread',
                        required=False,
                        type=str,
                        nargs='+',
                        help='Threads to scrape. Accepts full URLs or slugs (part of the URL after /).')
    parser.add_argument('-f', '--file',
                        metavar='file',
                        required=False,
                        type=str,
                        help='File to read threads from, one thread per line.')
    parser.add_argument('-o', '--output',
                        choices=['csv', 'json'],
                        default='csv',
                        help='Output format (csv or json). Default is csv.')
    args = parser.parse_args()

    thread_list = [extract_slug(t) for t in args.threads] if args.threads else []

    if args.file:
        try:
            with open(args.file, 'r', encoding='utf-8') as file:
                # Bug fix: skip blank lines so an empty line in the input
                # file no longer yields an empty slug (which would request
                # the site's front page instead of a thread).
                thread_list.extend(
                    extract_slug(stripped)
                    for line in file
                    if (stripped := line.strip()))
        except IOError as e:
            logging.error('Error reading file %s: %s', args.file, e)
            sys.exit(1)

    if thread_list:
        asyncio.run(main(thread_list, args.output))
    else:
        logging.error('No threads provided. Exiting.')
        parser.print_help()
        sys.exit(1)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# Script entry point: delegate to the CLI parser/runner.
if __name__ == '__main__':
    cli()
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Module to write data to files in either CSV or JSON format.
|
|
3
|
+
|
|
4
|
+
Raises:
|
|
5
|
+
ValueError: if an unsupported format is specified.
|
|
6
|
+
"""
|
|
7
|
+
import csv
|
|
8
|
+
import json
|
|
9
|
+
from typing import List, Dict, Any, Literal
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class DataWriter:
    """
    Write scraped entries to a file in either CSV or JSON format.

    All methods are static; the class acts as a simple namespace.

    Raises:
        ValueError: if an unsupported format is specified.
    """
    @staticmethod
    def write_data(filename: str,
                   data: List[Dict[str, Any]],
                   filetype: Literal['csv', 'json']) -> None:
        """
        Write data to a file in either CSV or JSON format.

        Args:
            filename (str): the name of the file to write.
            data (List[Dict[str, Any]]): the rows to write; all rows are
                assumed to share the keys of the first row.
            filetype (Literal['csv', 'json']): the output format ('csv' or 'json').

        Raises:
            ValueError: if an unsupported format is specified.
        """
        if filetype == 'csv':
            DataWriter._write_csv(filename, data)
        elif filetype == 'json':
            DataWriter._write_json(filename, data)
        else:
            raise ValueError(f"Unsupported format: {filetype}")

    @staticmethod
    def _write_csv(filename: str, data: List[Dict[str, Any]]) -> None:
        """Write rows as CSV, header derived from the first row's keys."""
        # Bug fix: bail out before opening the file so an empty dataset no
        # longer leaves a zero-byte file behind.
        if not data:
            return
        with open(filename, 'w', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=data[0].keys())
            writer.writeheader()
            writer.writerows(data)

    @staticmethod
    def _write_json(filename: str, data: List[Dict[str, Any]]) -> None:
        """Write rows as pretty-printed UTF-8 JSON (non-ASCII kept verbatim)."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Contains the EksiSozlukScraper class. Scrapes threads from eksisozluk.com
|
|
3
|
+
"""
|
|
4
|
+
from typing import List, Dict, Any
|
|
5
|
+
import logging
|
|
6
|
+
import asyncio
|
|
7
|
+
|
|
8
|
+
from bs4 import BeautifulSoup
|
|
9
|
+
from curl_cffi import requests
|
|
10
|
+
import backoff
|
|
11
|
+
|
|
12
|
+
class EksiSozlukScraper:
    """
    Scraper class for EksiSozluk. Handles the scraping logic of threads.
    """

    def __init__(self, base_url: str):
        """
        Initializes the scraper with the base URL.

        Args:
            base_url (str): The base URL of EksiSozluk.
        """
        self.base_url = base_url

    async def find_number_of_pages(self,
                                   session: requests.AsyncSession,
                                   url: str) -> int:
        """Finds the number of pages in a thread.

        Reads the `data-pagecount` attribute of the `.pager` div on the
        thread's first page. Falls back to 1 on any failure so the caller
        still scrapes at least the first page.

        Args:
            session (requests.AsyncSession): session to make requests
            url (str): url of the thread

        Returns:
            int: number of pages in the thread.
        """
        try:
            response = await session.get(url)
            if response.status_code != 200:
                logging.error("Failed to fetch %s (status %s)",
                              url, response.status_code)
                return 1
            soup = BeautifulSoup(response.text, 'lxml')
            pager_div = soup.find('div', class_='pager')
            if pager_div and 'data-pagecount' in pager_div.attrs:
                return int(pager_div['data-pagecount'])
            return 1
        except Exception as e:
            # Best-effort: any parsing/transport problem degrades to a
            # single-page scrape instead of aborting the thread.
            logging.error("Unexpected error in find_number_of_pages: %s", e)
            return 1

    def _parse_entry(self, entry: BeautifulSoup) -> Dict[str, Any]:
        """
        Parses an entry and returns a dictionary with the content,
        author, date created and last changed.

        Args:
            entry (BeautifulSoup): an entry element in the thread

        Returns:
            dict with keys 'Content', 'Author', 'Date Created', 'Last Changed'.
        """
        content_div = entry.find(class_='content')
        # Replace shortened link text with the full URL from href
        for a in content_div.find_all('a', href=True):
            if a['href'].startswith('http'):
                a.string = a['href']
        content = content_div.get_text(separator=' ').strip()
        author = entry.find(class_='entry-author').text.strip()
        entry_date_text = entry.find(class_='entry-date').text.strip()

        # "created ~ edited" means the entry was edited after creation.
        # maxsplit=1 guards against a stray extra '~' in the date text,
        # which previously raised an unpacking ValueError for that page.
        if '~' in entry_date_text:
            date_created, last_changed = [
                part.strip() for part in entry_date_text.split('~', 1)]
        else:
            date_created = entry_date_text
            last_changed = 'null'

        return {
            'Content': content,
            'Author': author,
            'Date Created': date_created,
            'Last Changed': last_changed
        }

    @backoff.on_exception(backoff.expo,
                          requests.RequestsError,
                          max_tries=8,
                          max_time=300)
    async def scrape_page(self,
                          session: requests.AsyncSession,
                          url: str,
                          semaphore: asyncio.Semaphore) -> List[Dict[str, Any]]:
        """
        Scrapes a single page of a thread and returns its parsed entries.

        Args:
            session (requests.AsyncSession): session to make requests
            url (str): url of the page to scrape
            semaphore (asyncio.Semaphore): limits the number of concurrent requests

        Returns:
            list of parsed entry dicts; empty list on failure.
        """
        async with semaphore:
            try:
                response = await session.get(url)
                if response.status_code != 200:
                    logging.error("Failed to fetch %s (status %s)",
                                  url, response.status_code)
                    return []
                soup = BeautifulSoup(response.text, 'lxml')
                entries = soup.find_all(id='entry-item')
                return [self._parse_entry(entry) for entry in entries]
            except requests.RequestsError:
                # Bug fix: let transport errors propagate so the backoff
                # decorator can actually retry. Previously the blanket
                # `except Exception` swallowed them and no retry ever ran.
                raise
            except Exception as e:
                logging.error("Unexpected error in scrape_page %s: %s", url, e)
                return []

    async def scrape_thread(self,
                            session: requests.AsyncSession,
                            thread: str,
                            max_concurrent_requests: int = 15):
        """
        Scrapes all pages of a thread concurrently and returns the entries.

        Args:
            session (requests.AsyncSession): session to make requests
            thread (str): thread to scrape, the part of the url after the /, before, if exists, ?.
            max_concurrent_requests (int, optional): max # of concurrent requests. Defaults to 15.

        Returns:
            flat list of entry dicts across all pages, in page order.
        """
        semaphore = asyncio.Semaphore(max_concurrent_requests)

        thread_url = self.base_url + thread
        number_of_pages = await self.find_number_of_pages(session, thread_url)
        tasks = [self.scrape_page(session, f"{thread_url}?p={page}", semaphore)
                 for page in range(1, number_of_pages + 1)]
        results = await asyncio.gather(*tasks)

        return [entry for page in results for entry in page]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "eksi-scraper"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "asynchronously scrapes eksisozluk threads and exports to csv or json"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Ibrahim Berkay Ceylan", email = "ceylaniberkay@gmail.com" }
|
|
14
|
+
]
|
|
15
|
+
dependencies = [
|
|
16
|
+
"backoff>=2.2",
|
|
17
|
+
"beautifulsoup4>=4.12",
|
|
18
|
+
"lxml>=5.1",
|
|
19
|
+
"curl_cffi>=0.7",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[tool.hatch.build.targets.wheel]
|
|
23
|
+
packages = ["eksisozluk_scraper"]
|
|
24
|
+
|
|
25
|
+
[project.urls]
|
|
26
|
+
Homepage = "https://github.com/iberkayC/eksi-scraper"
|
|
27
|
+
|
|
28
|
+
[project.scripts]
|
|
29
|
+
eksi-scraper = "eksisozluk_scraper.cli:cli"
|