pocong 1.0.0__tar.gz → 1.1.0__tar.gz
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- pocong-1.1.0/PKG-INFO +229 -0
- pocong-1.1.0/README.md +175 -0
- {pocong-1.0.0 → pocong-1.1.0}/setup.cfg +7 -0
- {pocong-1.0.0 → pocong-1.1.0}/setup.py +4 -0
- {pocong-1.0.0 → pocong-1.1.0}/src/pocong/_version.py +3 -3
- pocong-1.1.0/src/pocong/media_spiders/__init__.py +231 -0
- {pocong-1.0.0 → pocong-1.1.0}/src/pocong/proxy_spiders/__init__.py +21 -23
- pocong-1.1.0/src/pocong.egg-info/PKG-INFO +229 -0
- {pocong-1.0.0 → pocong-1.1.0}/src/pocong.egg-info/SOURCES.txt +2 -0
- {pocong-1.0.0 → pocong-1.1.0}/src/pocong.egg-info/requires.txt +4 -0
- pocong-1.1.0/tests/test_media_spiders.py +234 -0
- pocong-1.0.0/PKG-INFO +0 -114
- pocong-1.0.0/README.md +0 -64
- pocong-1.0.0/src/pocong.egg-info/PKG-INFO +0 -114
- {pocong-1.0.0 → pocong-1.1.0}/MANIFEST.in +0 -0
- {pocong-1.0.0 → pocong-1.1.0}/pyproject.toml +0 -0
- {pocong-1.0.0 → pocong-1.1.0}/src/pocong/__init__.py +0 -0
- {pocong-1.0.0 → pocong-1.1.0}/src/pocong/cli.py +0 -0
- {pocong-1.0.0 → pocong-1.1.0}/src/pocong/pocong.py +0 -0
- {pocong-1.0.0 → pocong-1.1.0}/src/pocong/proxy_spiders/pipelines.py +0 -0
- {pocong-1.0.0 → pocong-1.1.0}/src/pocong/proxy_spiders/spiders/__init__.py +0 -0
- {pocong-1.0.0 → pocong-1.1.0}/src/pocong/proxy_spiders/spiders/free_proxy_list_net_spider.py +0 -0
- {pocong-1.0.0 → pocong-1.1.0}/src/pocong/utils.py +0 -0
- {pocong-1.0.0 → pocong-1.1.0}/src/pocong.egg-info/dependency_links.txt +0 -0
- {pocong-1.0.0 → pocong-1.1.0}/src/pocong.egg-info/entry_points.txt +0 -0
- {pocong-1.0.0 → pocong-1.1.0}/src/pocong.egg-info/top_level.txt +0 -0
- {pocong-1.0.0 → pocong-1.1.0}/tests/test_pocong.py +0 -0
- {pocong-1.0.0 → pocong-1.1.0}/tests/test_proxy_spiders.py +0 -0
- {pocong-1.0.0 → pocong-1.1.0}/versioneer.py +0 -0
pocong-1.1.0/PKG-INFO
ADDED
@@ -0,0 +1,229 @@
Metadata-Version: 2.4
Name: pocong
Version: 1.1.0
Summary: Python Oriented Crawling Ongoing (POCONG): a simple crawling framework
Home-page: https://gitlab.com/mohsin3107/pocong
Author: Singgih
Author-email: singgih@alkode.id
License: MIT
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Operating System :: OS Independent
Classifier: Topic :: Software Development :: Libraries
Classifier: Topic :: Internet :: WWW/HTTP
Requires-Python: >=3.8
Description-Content-Type: text/markdown
Requires-Dist: Scrapy>=2.5.0
Requires-Dist: pandas>=1.3.0
Requires-Dist: requests>=2.25.0
Requires-Dist: Click>=7.0
Requires-Dist: mechanize>=0.4.0
Requires-Dist: html2text>=2020.1.16
Requires-Dist: fake-useragent>=1.1.0
Requires-Dist: beautifulsoup4>=4.9.0
Provides-Extra: dev
Requires-Dist: pytest; extra == "dev"
Requires-Dist: pytest-mock; extra == "dev"
Requires-Dist: pytest-cov; extra == "dev"
Requires-Dist: moto; extra == "dev"
Requires-Dist: tox; extra == "dev"
Requires-Dist: flake8; extra == "dev"
Requires-Dist: flake8-import-order; extra == "dev"
Requires-Dist: flake8-print; extra == "dev"
Requires-Dist: flake8-builtins; extra == "dev"
Requires-Dist: pep8-naming; extra == "dev"
Requires-Dist: pre-commit; extra == "dev"
Requires-Dist: rope; extra == "dev"
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: license
Dynamic: provides-extra
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary

<p align="center">
  <img src="https://i.ibb.co.com/35P4Nq9x/Screenshot-2025-08-22-at-18-40-11.png?width=128" alt="POCONG Logo" width="128"/>
</p>

# POCONG 🪦
**Python Oriented Crawling ON Going**

POCONG is a lightweight web crawling framework built in Python.

## Features

- 🔒 **Get Free Proxy**: Automatic proxy fetching, validation, and rotation from free proxy sources
- 🌐 **Dynamic Media Web Scraping**: Extract content, metadata, and media information from web pages with proxy support
- 📱 **Social Media Scraping**: Extract data from social media platforms *(coming soon)*
- 🛒 **E-commerce Scraping**: Extract product information from e-commerce websites *(coming soon)*

## Installation
```bash
pip install pocong
```

## Usage: Get Proxy from proxy_spiders

You can use the `get_proxy` and `get_proxy_random` methods from `proxy_spiders` to fetch working proxies.

```python
from pocong.proxy_spiders import GetProxy

gp = GetProxy()

# Get the first working proxy
proxy = gp.get_proxy()
print("First working proxy:", proxy)
```
```python
from pocong.proxy_spiders import GetProxy

gp = GetProxy()

# Get a random working proxy
random_proxy = gp.get_proxy_random()
print("Random working proxy:", random_proxy)
```

Sample output:
```
First working proxy: {'ip': '123.45.67.89', 'port': '8080', 'https': 'yes', ...}
Random working proxy: {'ip': '98.76.54.32', 'port': '3128', 'https': 'yes', ...}
```

You can use the returned proxy dictionary with the `requests` library, for example:

```python
import requests

proxy = gp.get_proxy()
if proxy:
    proxies = {
        'http': f"http://{proxy['ip']}:{proxy['port']}",
        'https': f"http://{proxy['ip']}:{proxy['port']}"
    }
    response = requests.get('https://httpbin.org/ip', proxies=proxies)
    print(response.json())
else:
    print("No working proxy found.")
```

- `get_proxy()` will return the first working proxy found.
- `get_proxy_random()` will return a random working proxy (with up to 20 retries).

Both methods return a dictionary with proxy details (e.g., `{ 'ip': '...', 'port': '...', ... }`) or `None` if no working proxy is found.

## Usage: Dynamic Media Web Scraping

The `DynamicScrapingNews` class provides comprehensive web scraping capabilities with built-in proxy support for extracting content, metadata, and media information from web pages.

### Basic Usage

```python
from pocong.media_spiders import DynamicScrapingNews

# Simple scraping without proxy
scraper = DynamicScrapingNews("https://example.com", use_proxy=False)
result = scraper.scrape()

# Extract specific information
print(f"Title: {result['title']}")
print(f"URL: {result['url']}")
print(f"Media: {result['media']}")
print(f"Published: {result['published_date']}")
print(f"Text content: {result['text'][:200]}...")  # First 200 chars
```

### Proxy Configuration Options

#### 1. Automatic Proxy (Default)
```python
# Uses automatic proxy fetching
scraper = DynamicScrapingNews("https://example.com")
result = scraper.scrape()
```

#### 2. Manual Proxy Configuration
```python
# Method 1: IP:Port format
scraper = DynamicScrapingNews("https://example.com",
                              manual_proxy="192.168.1.1:8080")

# Method 2: Full URL format
scraper = DynamicScrapingNews("https://example.com",
                              manual_proxy="http://192.168.1.1:8080")

# Method 3: Dictionary format
scraper = DynamicScrapingNews("https://example.com",
                              manual_proxy={"ip": "192.168.1.1", "port": "8080"})

result = scraper.scrape()
```

#### 3. No Proxy
```python
# Disable proxy completely
scraper = DynamicScrapingNews("https://example.com", use_proxy=False)
result = scraper.scrape()
```

#### 4. Manual Proxy Override
```python
# Manual proxy overrides use_proxy setting
scraper = DynamicScrapingNews("https://example.com",
                              use_proxy=False,
                              manual_proxy="192.168.1.1:8080")
result = scraper.scrape()
```

### Complete Example with Proxy Integration

```python
from pocong.proxy_spiders import GetProxy
from pocong.media_spiders import DynamicScrapingNews

# Get a working proxy
proxy = GetProxy().get_proxy()
print(f"Using proxy: {proxy}")

# Use automatic proxy (default behavior)
scraper = DynamicScrapingNews("https://example.com")
result = scraper.scrape()

# Use manual proxy with ip:port format
scraper = DynamicScrapingNews("https://example.com",
                              manual_proxy=f"{proxy['ip']}:{proxy['port']}")
result = scraper.scrape()

# Use manual proxy with dictionary format
scraper = DynamicScrapingNews("https://example.com",
                              manual_proxy={"ip": proxy['ip'], "port": proxy['port']})
result = scraper.scrape()
```

### Extracted Data Structure

The `scrape()` method returns a dictionary containing:

```python
{
    'title': 'Page Title',            # Extracted from og:title or title tag
    'url': 'https://example.com',     # Canonical URL
    'image': 'https://...',           # Featured image URL
    'html': '<html>...</html>',       # Full HTML content
    'text': 'Clean text content',     # Processed text without HTML
    'media': 'example',               # Domain name extracted from URL
    'published_date': datetime(...)   # Publication date if found
}
```
pocong-1.1.0/README.md
ADDED
@@ -0,0 +1,175 @@
<p align="center">
  <img src="https://i.ibb.co.com/35P4Nq9x/Screenshot-2025-08-22-at-18-40-11.png?width=128" alt="POCONG Logo" width="128"/>
</p>

# POCONG 🪦
**Python Oriented Crawling ON Going**

POCONG is a lightweight web crawling framework built in Python.

## Features

- 🔒 **Get Free Proxy**: Automatic proxy fetching, validation, and rotation from free proxy sources
- 🌐 **Dynamic Media Web Scraping**: Extract content, metadata, and media information from web pages with proxy support
- 📱 **Social Media Scraping**: Extract data from social media platforms *(coming soon)*
- 🛒 **E-commerce Scraping**: Extract product information from e-commerce websites *(coming soon)*

## Installation
```bash
pip install pocong
```

## Usage: Get Proxy from proxy_spiders

You can use the `get_proxy` and `get_proxy_random` methods from `proxy_spiders` to fetch working proxies.

```python
from pocong.proxy_spiders import GetProxy

gp = GetProxy()

# Get the first working proxy
proxy = gp.get_proxy()
print("First working proxy:", proxy)
```
```python
from pocong.proxy_spiders import GetProxy

gp = GetProxy()

# Get a random working proxy
random_proxy = gp.get_proxy_random()
print("Random working proxy:", random_proxy)
```

Sample output:
```
First working proxy: {'ip': '123.45.67.89', 'port': '8080', 'https': 'yes', ...}
Random working proxy: {'ip': '98.76.54.32', 'port': '3128', 'https': 'yes', ...}
```

You can use the returned proxy dictionary with the `requests` library, for example:

```python
import requests

proxy = gp.get_proxy()
if proxy:
    proxies = {
        'http': f"http://{proxy['ip']}:{proxy['port']}",
        'https': f"http://{proxy['ip']}:{proxy['port']}"
    }
    response = requests.get('https://httpbin.org/ip', proxies=proxies)
    print(response.json())
else:
    print("No working proxy found.")
```

- `get_proxy()` will return the first working proxy found.
- `get_proxy_random()` will return a random working proxy (with up to 20 retries).

Both methods return a dictionary with proxy details (e.g., `{ 'ip': '...', 'port': '...', ... }`) or `None` if no working proxy is found.

## Usage: Dynamic Media Web Scraping

The `DynamicScrapingNews` class provides comprehensive web scraping capabilities with built-in proxy support for extracting content, metadata, and media information from web pages.

### Basic Usage

```python
from pocong.media_spiders import DynamicScrapingNews

# Simple scraping without proxy
scraper = DynamicScrapingNews("https://example.com", use_proxy=False)
result = scraper.scrape()

# Extract specific information
print(f"Title: {result['title']}")
print(f"URL: {result['url']}")
print(f"Media: {result['media']}")
print(f"Published: {result['published_date']}")
print(f"Text content: {result['text'][:200]}...")  # First 200 chars
```

### Proxy Configuration Options

#### 1. Automatic Proxy (Default)
```python
# Uses automatic proxy fetching
scraper = DynamicScrapingNews("https://example.com")
result = scraper.scrape()
```

#### 2. Manual Proxy Configuration
```python
# Method 1: IP:Port format
scraper = DynamicScrapingNews("https://example.com",
                              manual_proxy="192.168.1.1:8080")

# Method 2: Full URL format
scraper = DynamicScrapingNews("https://example.com",
                              manual_proxy="http://192.168.1.1:8080")

# Method 3: Dictionary format
scraper = DynamicScrapingNews("https://example.com",
                              manual_proxy={"ip": "192.168.1.1", "port": "8080"})

result = scraper.scrape()
```

#### 3. No Proxy
```python
# Disable proxy completely
scraper = DynamicScrapingNews("https://example.com", use_proxy=False)
result = scraper.scrape()
```

#### 4. Manual Proxy Override
```python
# Manual proxy overrides use_proxy setting
scraper = DynamicScrapingNews("https://example.com",
                              use_proxy=False,
                              manual_proxy="192.168.1.1:8080")
result = scraper.scrape()
```

### Complete Example with Proxy Integration

```python
from pocong.proxy_spiders import GetProxy
from pocong.media_spiders import DynamicScrapingNews

# Get a working proxy
proxy = GetProxy().get_proxy()
print(f"Using proxy: {proxy}")

# Use automatic proxy (default behavior)
scraper = DynamicScrapingNews("https://example.com")
result = scraper.scrape()

# Use manual proxy with ip:port format
scraper = DynamicScrapingNews("https://example.com",
                              manual_proxy=f"{proxy['ip']}:{proxy['port']}")
result = scraper.scrape()

# Use manual proxy with dictionary format
scraper = DynamicScrapingNews("https://example.com",
                              manual_proxy={"ip": proxy['ip'], "port": proxy['port']})
result = scraper.scrape()
```

### Extracted Data Structure

The `scrape()` method returns a dictionary containing:

```python
{
    'title': 'Page Title',            # Extracted from og:title or title tag
    'url': 'https://example.com',     # Canonical URL
    'image': 'https://...',           # Featured image URL
    'html': '<html>...</html>',       # Full HTML content
    'text': 'Clean text content',     # Processed text without HTML
    'media': 'example',               # Domain name extracted from URL
    'published_date': datetime(...)   # Publication date if found
}
```
{pocong-1.0.0 → pocong-1.1.0}/setup.cfg
@@ -46,7 +46,14 @@ package_dir =
 packages = find:
 python_requires = >=3.8
 install_requires =
+    Scrapy>=2.5.0
+    pandas>=1.3.0
+    requests>=2.25.0
     Click>=7.0
+    mechanize>=0.4.0
+    html2text>=2020.1.16
+    fake-useragent>=1.1.0
+    beautifulsoup4>=4.9.0
 
 [options.extras_require]
 dev =
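
The seven new `install_requires` entries above pull the scraping stack in at install time (Click was already required). As a quick, illustrative sanity check (not part of this diff), each new requirement can be imported by its module name after installing the release:

```python
# Illustrative only: assumes `pip install pocong==1.1.0` has been run.
# One import per new install_requires entry added in this release.
import scrapy          # Scrapy>=2.5.0
import pandas          # pandas>=1.3.0
import requests        # requests>=2.25.0
import mechanize       # mechanize>=0.4.0
import html2text       # html2text>=2020.1.16
import fake_useragent  # fake-useragent>=1.1.0
import bs4             # beautifulsoup4>=4.9.0

print("new runtime dependencies import cleanly")
```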
{pocong-1.0.0 → pocong-1.1.0}/src/pocong/_version.py
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2025-
+ "date": "2025-10-11T12:21:52+0700",
  "dirty": false,
  "error": null,
- "full-revisionid": "
- "version": "1.
+ "full-revisionid": "4578ac0583c995b27ae2ec3e0297768410ee0bbc",
+ "version": "1.1.0"
 }
 ''' # END VERSION_JSON
 
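
The regenerated `_version.py` is the blob versioneer bakes into the sdist at release time. As an illustrative check (not part of the diff), the version bump is visible from the installed distribution metadata:

```python
# Illustrative only: assumes pocong 1.1.0 is installed in the current environment.
from importlib.metadata import version

print(version("pocong"))  # expected output: 1.1.0
```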
pocong-1.1.0/src/pocong/media_spiders/__init__.py
ADDED
@@ -0,0 +1,231 @@
import datetime as dt
import re

import mechanize
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from html2text import html2text

try:
    from pocong.proxy_spiders import GetProxy
    PROXY_AVAILABLE = True
except ImportError:
    PROXY_AVAILABLE = False


class DynamicScrapingNews():
    def __init__(self, url, use_proxy=True, manual_proxy=None):
        self.url = url
        self.use_proxy = use_proxy
        self.proxy = None

        # Use manual proxy if provided
        if manual_proxy:
            self.proxy = self._format_proxy(manual_proxy)
        # Otherwise, initialize proxy if available and requested
        elif self.use_proxy and PROXY_AVAILABLE:
            try:
                proxy_getter = GetProxy()
                proxy_data = proxy_getter.get_proxy_random()
                if proxy_data:
                    self.proxy = f"http://{proxy_data['ip']}:{proxy_data['port']}"
            except Exception:
                # If proxy initialization fails, continue without proxy
                self.proxy = None

    def _format_proxy(self, proxy):
        """
        Format proxy to ensure it has the correct format.
        Accepts formats like:
        - "ip:port"
        - "http://ip:port"
        - "https://ip:port"
        - {"ip": "x.x.x.x", "port": "xxxx"}
        """
        if isinstance(proxy, dict):
            # If proxy is a dict with ip and port
            if 'ip' in proxy and 'port' in proxy:
                return f"http://{proxy['ip']}:{proxy['port']}"
            else:
                raise ValueError("Manual proxy dict must contain 'ip' and 'port' keys")
        elif isinstance(proxy, str):
            # If proxy is a string
            if proxy.startswith(('http://', 'https://')):
                return proxy
            else:
                # Assume it's in ip:port format
                return f"http://{proxy}"
        else:
            raise ValueError("Manual proxy must be a string or dict")

    def _remove_html_tags(self, text):
        # This regular expression will match any HTML tag and capture its contents.
        html_tags_pattern = r'<.*?>'
        # Use re.sub to replace all matches with an empty string.
        clean_text = re.sub(html_tags_pattern, '', text)
        return clean_text

    def _get_metadata(self, html, list_metadata=['title', 'url', 'image']):
        result = dict()
        for metadata in list_metadata:
            # Define the regular expression pattern
            pattern = r'property="og:{}" content="([^"]+)"'.format(metadata)

            # Search for the pattern in the HTML content
            match = re.search(pattern, html)

            if match:
                # Extract the content from the matched group
                og_content = match.group(1)
                result[metadata] = og_content if '?' not in og_content else og_content.split('?')[0]
            else:
                if metadata == 'url':
                    result[metadata] = self.url if '?' not in self.url else self.url.split('?')[0]
                else:
                    result[metadata] = "Pattern not found in the HTML content."
            result[metadata] = self._remove_html_tags(BeautifulSoup(result[metadata], 'html.parser').get_text())
            result[metadata] = re.sub(r"&amp;", "&", result[metadata])
        return result

    def _clean_html_to_text(self, html):
        # First we remove inline JavaScript/CSS:
        cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
        # Then we remove html comments. This has to be done before removing regular
        # tags since comments can contain '>' characters.
        cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
        # Next we can remove the remaining tags:
        cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
        # Finally, we deal with whitespace
        cleaned = re.sub(r"&nbsp;", " ", cleaned)
        cleaned = re.sub(r"  ", " ", cleaned)
        cleaned = re.sub(r"  ", " ", cleaned)
        text = html2text(cleaned).format('utf-8')
        spe_char = [
            '\\u0621', '\\u0622', '\\u0625', '\\u0627', '\\u0629', '\\u062a', '\\u062b', '\\u062c', '\\u062f',
            '\\u0631', '\\u0632', '\\u0633', '\\u0634', '\\u0636', '\\u0637', '\\u0639', '\\u063a', '\\u0641',
            '\\u0643', '\\u0644', '\\u0645', '\\u0646', '\\u0647', '\\u0648', '\\u064a'
        ]
        for char in spe_char:
            text = text.replace(char, '')
        return text.strip()

    def _get_media(self, url):
        # Define a regular expression pattern to match the main domain (excluding "sport" and subdomains)
        pattern = r"https?://(?:www\.)?(?:[^./]+\.)*([^.]+\.\w+)"

        # Use re.search to find the first match
        match = re.search(pattern, url.replace('.co.', '.'))

        # Extract the matched domain
        if match:
            domain = match.group(1)
            return domain.split('.')[0]
        else:
            return None

    def _get_pubdate(self, html):
        # Define a regular expression pattern to match the content attribute value
        pattern = r'content="(\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2})"'

        # Use re.search to find the first match
        match = re.search(pattern, html)

        # Extract the matched content attribute value
        if match:
            content_value = match.group(1)

            # Convert the content value to a datetime format
            datetime_format = "%Y/%m/%d %H:%M:%S"
            parsed_datetime = dt.datetime.strptime(content_value, datetime_format)

            return parsed_datetime
        else:
            # Define a regular expression pattern to match the content attribute value
            pattern = r'content="(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})"'

            # Use re.search to find the first match
            match = re.search(pattern, html)

            # Extract the matched content attribute value
            if match:
                content_value = match.group(1)

                # Convert the content value to a datetime format
                datetime_format = "%Y-%m-%d %H:%M:%S"
                parsed_datetime = dt.datetime.strptime(content_value, datetime_format)

                return parsed_datetime
            else:
                # Define a regular expression pattern to match the content attribute value
                pattern = r'content="(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+\d{2}:\d{2})"'

                # Use re.search to find the first match
                match = re.search(pattern, html)

                # Extract the matched content attribute value
                if match:
                    content_value = match.group(1)

                    # Convert the content value to a datetime format
                    datetime_format = "%Y-%m-%dT%H:%M:%S%z"
                    parsed_datetime = dt.datetime.strptime(content_value, datetime_format)

                    return parsed_datetime
                else:
                    return None

    def _get_html(self, url):
        # random useragent
        ua = UserAgent()
        user_agent = ua.random
        headers = {'User-Agent': user_agent}

        # Try with mechanize first (with proxy if available)
        try:
            br = mechanize.Browser()
            br.set_handle_robots(False)
            br.addheaders = [('User-Agent', user_agent)]

            # Set proxy for mechanize if available
            if self.proxy:
                br.set_proxies({'http': self.proxy, 'https': self.proxy})

            html = br.open(url).read().decode('utf-8')
            return html
        except Exception:
            # Fallback to requests (with proxy if available)
            try:
                proxies = {'http': self.proxy, 'https': self.proxy} if self.proxy else None
                response = requests.get(url, headers=headers, proxies=proxies, timeout=30)
                html = response.content.decode('utf-8')
                return html
            except Exception:
                # Final fallback without proxy
                response = requests.get(url, headers=headers, timeout=30)
                html = response.content.decode('utf-8')
                return html

    def scrape(self):
        # get html from url
        html = self._get_html(self.url)

        # get metadata
        metadata = self._get_metadata(html)

        # convert html to text
        text = self._clean_html_to_text(html)

        # get media from url
        media = self._get_media(self.url)

        # get published_date from html
        published_date = self._get_pubdate(html)

        # combine result
        metadata['html'] = html
        metadata['text'] = text
        metadata['media'] = media
        metadata['published_date'] = published_date

        return metadata
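
The `_format_proxy` helper above is what makes the three `manual_proxy` formats shown in the README interchangeable. A minimal sketch of that normalization (the addresses are placeholders and this snippet is not part of the package or its tests; nothing is fetched because `use_proxy=False` and `scrape()` is never called):

```python
from pocong.media_spiders import DynamicScrapingNews

# Constructing with use_proxy=False skips the proxy lookup, so no network access happens here.
scraper = DynamicScrapingNews("https://example.com", use_proxy=False)

print(scraper._format_proxy("192.168.1.1:8080"))                     # http://192.168.1.1:8080
print(scraper._format_proxy("https://192.168.1.1:8080"))             # returned unchanged
print(scraper._format_proxy({"ip": "192.168.1.1", "port": "8080"}))  # http://192.168.1.1:8080
```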