pocong 1.0.0__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. pocong-1.1.0/PKG-INFO +229 -0
  2. pocong-1.1.0/README.md +175 -0
  3. {pocong-1.0.0 → pocong-1.1.0}/setup.cfg +7 -0
  4. {pocong-1.0.0 → pocong-1.1.0}/setup.py +4 -0
  5. {pocong-1.0.0 → pocong-1.1.0}/src/pocong/_version.py +3 -3
  6. pocong-1.1.0/src/pocong/media_spiders/__init__.py +231 -0
  7. {pocong-1.0.0 → pocong-1.1.0}/src/pocong/proxy_spiders/__init__.py +21 -23
  8. pocong-1.1.0/src/pocong.egg-info/PKG-INFO +229 -0
  9. {pocong-1.0.0 → pocong-1.1.0}/src/pocong.egg-info/SOURCES.txt +2 -0
  10. {pocong-1.0.0 → pocong-1.1.0}/src/pocong.egg-info/requires.txt +4 -0
  11. pocong-1.1.0/tests/test_media_spiders.py +234 -0
  12. pocong-1.0.0/PKG-INFO +0 -114
  13. pocong-1.0.0/README.md +0 -64
  14. pocong-1.0.0/src/pocong.egg-info/PKG-INFO +0 -114
  15. {pocong-1.0.0 → pocong-1.1.0}/MANIFEST.in +0 -0
  16. {pocong-1.0.0 → pocong-1.1.0}/pyproject.toml +0 -0
  17. {pocong-1.0.0 → pocong-1.1.0}/src/pocong/__init__.py +0 -0
  18. {pocong-1.0.0 → pocong-1.1.0}/src/pocong/cli.py +0 -0
  19. {pocong-1.0.0 → pocong-1.1.0}/src/pocong/pocong.py +0 -0
  20. {pocong-1.0.0 → pocong-1.1.0}/src/pocong/proxy_spiders/pipelines.py +0 -0
  21. {pocong-1.0.0 → pocong-1.1.0}/src/pocong/proxy_spiders/spiders/__init__.py +0 -0
  22. {pocong-1.0.0 → pocong-1.1.0}/src/pocong/proxy_spiders/spiders/free_proxy_list_net_spider.py +0 -0
  23. {pocong-1.0.0 → pocong-1.1.0}/src/pocong/utils.py +0 -0
  24. {pocong-1.0.0 → pocong-1.1.0}/src/pocong.egg-info/dependency_links.txt +0 -0
  25. {pocong-1.0.0 → pocong-1.1.0}/src/pocong.egg-info/entry_points.txt +0 -0
  26. {pocong-1.0.0 → pocong-1.1.0}/src/pocong.egg-info/top_level.txt +0 -0
  27. {pocong-1.0.0 → pocong-1.1.0}/tests/test_pocong.py +0 -0
  28. {pocong-1.0.0 → pocong-1.1.0}/tests/test_proxy_spiders.py +0 -0
  29. {pocong-1.0.0 → pocong-1.1.0}/versioneer.py +0 -0
pocong-1.1.0/PKG-INFO ADDED
@@ -0,0 +1,229 @@
+ Metadata-Version: 2.4
+ Name: pocong
+ Version: 1.1.0
+ Summary: Python Oriented Crawling Ongoing (POCONG): a simple crawling framework
+ Home-page: https://gitlab.com/mohsin3107/pocong
+ Author: Singgih
+ Author-email: singgih@alkode.id
+ License: MIT
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Operating System :: OS Independent
+ Classifier: Topic :: Software Development :: Libraries
+ Classifier: Topic :: Internet :: WWW/HTTP
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ Requires-Dist: Scrapy>=2.5.0
+ Requires-Dist: pandas>=1.3.0
+ Requires-Dist: requests>=2.25.0
+ Requires-Dist: Click>=7.0
+ Requires-Dist: mechanize>=0.4.0
+ Requires-Dist: html2text>=2020.1.16
+ Requires-Dist: fake-useragent>=1.1.0
+ Requires-Dist: beautifulsoup4>=4.9.0
+ Provides-Extra: dev
+ Requires-Dist: pytest; extra == "dev"
+ Requires-Dist: pytest-mock; extra == "dev"
+ Requires-Dist: pytest-cov; extra == "dev"
+ Requires-Dist: moto; extra == "dev"
+ Requires-Dist: tox; extra == "dev"
+ Requires-Dist: flake8; extra == "dev"
+ Requires-Dist: flake8-import-order; extra == "dev"
+ Requires-Dist: flake8-print; extra == "dev"
+ Requires-Dist: flake8-builtins; extra == "dev"
+ Requires-Dist: pep8-naming; extra == "dev"
+ Requires-Dist: pre-commit; extra == "dev"
+ Requires-Dist: rope; extra == "dev"
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: license
+ Dynamic: provides-extra
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ <p align="center">
+ <img src="https://i.ibb.co.com/35P4Nq9x/Screenshot-2025-08-22-at-18-40-11.png?width=128" alt="POCONG Logo" width="128"/>
+ </p>
+
+ # POCONG 🪦
+ **Python Oriented Crawling ON Going**
+
+ POCONG is a lightweight web crawling framework built in Python.
+
+ ## Features
+
+ - 🔒 **Get Free Proxy**: Automatic proxy fetching, validation, and rotation from free proxy sources
+ - 🌐 **Dynamic Media Web Scraping**: Extract content, metadata, and media information from web pages with proxy support
+ - 📱 **Social Media Scraping**: Extract data from social media platforms *(coming soon)*
+ - 🛒 **E-commerce Scraping**: Extract product information from e-commerce websites *(coming soon)*
+
+ ## Installation
+ ```bash
+ pip install pocong
+ ```
+
+ ## Usage: Get Proxy from proxy_spiders
+
+ You can use the `get_proxy` and `get_proxy_random` methods from `proxy_spiders` to fetch working proxies.
+
+ ```python
+ from pocong.proxy_spiders import GetProxy
+
+ gp = GetProxy()
+
+ # Get the first working proxy
+ proxy = gp.get_proxy()
+ print("First working proxy:", proxy)
+ ```
+ ```python
+ from pocong.proxy_spiders import GetProxy
+
+ gp = GetProxy()
+
+ # Get a random working proxy
+ random_proxy = gp.get_proxy_random()
+ print("Random working proxy:", random_proxy)
+ ```
+
+ Sample output:
+ ```
+ First working proxy: {'ip': '123.45.67.89', 'port': '8080', 'https': 'yes', ...}
+ Random working proxy: {'ip': '98.76.54.32', 'port': '3128', 'https': 'yes', ...}
+ ```
+
+ You can use the returned proxy dictionary with the `requests` library, for example:
+
+ ```python
+ import requests
+
+ proxy = gp.get_proxy()
+ if proxy:
+     proxies = {
+         'http': f"http://{proxy['ip']}:{proxy['port']}",
+         'https': f"http://{proxy['ip']}:{proxy['port']}"
+     }
+     response = requests.get('https://httpbin.org/ip', proxies=proxies)
+     print(response.json())
+ else:
+     print("No working proxy found.")
+ ```
+
+ - `get_proxy()` will return the first working proxy found.
+ - `get_proxy_random()` will return a random working proxy (with up to 20 retries).
+
+ Both methods return a dictionary with proxy details (e.g., `{ 'ip': '...', 'port': '...', ... }`) or `None` if no working proxy is found.
+
+ ## Usage: Dynamic Media Web Scraping
+
+ The `DynamicScrapingNews` class provides comprehensive web scraping capabilities with built-in proxy support for extracting content, metadata, and media information from web pages.
+
+ ### Basic Usage
+
+ ```python
+ from pocong.media_spiders import DynamicScrapingNews
+
+ # Simple scraping without proxy
+ scraper = DynamicScrapingNews("https://example.com", use_proxy=False)
+ result = scraper.scrape()
+
+ # Extract specific information
+ print(f"Title: {result['title']}")
+ print(f"URL: {result['url']}")
+ print(f"Media: {result['media']}")
+ print(f"Published: {result['published_date']}")
+ print(f"Text content: {result['text'][:200]}...")  # First 200 chars
+ ```
+
+ ### Proxy Configuration Options
+
+ #### 1. Automatic Proxy (Default)
+ ```python
+ # Uses automatic proxy fetching
+ scraper = DynamicScrapingNews("https://example.com")
+ result = scraper.scrape()
+ ```
+
+ #### 2. Manual Proxy Configuration
+ ```python
+ # Method 1: IP:Port format
+ scraper = DynamicScrapingNews("https://example.com",
+                               manual_proxy="192.168.1.1:8080")
+
+ # Method 2: Full URL format
+ scraper = DynamicScrapingNews("https://example.com",
+                               manual_proxy="http://192.168.1.1:8080")
+
+ # Method 3: Dictionary format
+ scraper = DynamicScrapingNews("https://example.com",
+                               manual_proxy={"ip": "192.168.1.1", "port": "8080"})
+
+ result = scraper.scrape()
+ ```
+
+ #### 3. No Proxy
+ ```python
+ # Disable proxy completely
+ scraper = DynamicScrapingNews("https://example.com", use_proxy=False)
+ result = scraper.scrape()
+ ```
+
+ #### 4. Manual Proxy Override
+ ```python
+ # Manual proxy overrides use_proxy setting
+ scraper = DynamicScrapingNews("https://example.com",
+                               use_proxy=False,
+                               manual_proxy="192.168.1.1:8080")
+ result = scraper.scrape()
+ ```
+
+ ### Complete Example with Proxy Integration
+
+ ```python
+ from pocong.proxy_spiders import GetProxy
+ from pocong.media_spiders import DynamicScrapingNews
+
+ # Get a working proxy
+ proxy = GetProxy().get_proxy()
+ print(f"Using proxy: {proxy}")
+
+ # Use automatic proxy (default behavior)
+ scraper = DynamicScrapingNews("https://example.com")
+ result = scraper.scrape()
+
+ # Use manual proxy with ip:port format
+ scraper = DynamicScrapingNews("https://example.com",
+                               manual_proxy=f"{proxy['ip']}:{proxy['port']}")
+ result = scraper.scrape()
+
+ # Use manual proxy with dictionary format
+ scraper = DynamicScrapingNews("https://example.com",
+                               manual_proxy={"ip": proxy['ip'], "port": proxy['port']})
+ result = scraper.scrape()
+ ```
+
+ ### Extracted Data Structure
+
+ The `scrape()` method returns a dictionary containing:
+
+ ```python
+ {
+     'title': 'Page Title',            # Extracted from og:title or title tag
+     'url': 'https://example.com',     # Canonical URL
+     'image': 'https://...',           # Featured image URL
+     'html': '<html>...</html>',       # Full HTML content
+     'text': 'Clean text content',     # Processed text without HTML
+     'media': 'example',               # Domain name extracted from URL
+     'published_date': datetime(...)   # Publication date if found
+ }
+ ```
pocong-1.1.0/README.md ADDED
@@ -0,0 +1,175 @@
+ <p align="center">
+ <img src="https://i.ibb.co.com/35P4Nq9x/Screenshot-2025-08-22-at-18-40-11.png?width=128" alt="POCONG Logo" width="128"/>
+ </p>
+
+ # POCONG 🪦
+ **Python Oriented Crawling ON Going**
+
+ POCONG is a lightweight web crawling framework built in Python.
+
+ ## Features
+
+ - 🔒 **Get Free Proxy**: Automatic proxy fetching, validation, and rotation from free proxy sources
+ - 🌐 **Dynamic Media Web Scraping**: Extract content, metadata, and media information from web pages with proxy support
+ - 📱 **Social Media Scraping**: Extract data from social media platforms *(coming soon)*
+ - 🛒 **E-commerce Scraping**: Extract product information from e-commerce websites *(coming soon)*
+
+ ## Installation
+ ```bash
+ pip install pocong
+ ```
+
+ ## Usage: Get Proxy from proxy_spiders
+
+ You can use the `get_proxy` and `get_proxy_random` methods from `proxy_spiders` to fetch working proxies.
+
+ ```python
+ from pocong.proxy_spiders import GetProxy
+
+ gp = GetProxy()
+
+ # Get the first working proxy
+ proxy = gp.get_proxy()
+ print("First working proxy:", proxy)
+ ```
+ ```python
+ from pocong.proxy_spiders import GetProxy
+
+ gp = GetProxy()
+
+ # Get a random working proxy
+ random_proxy = gp.get_proxy_random()
+ print("Random working proxy:", random_proxy)
+ ```
+
+ Sample output:
+ ```
+ First working proxy: {'ip': '123.45.67.89', 'port': '8080', 'https': 'yes', ...}
+ Random working proxy: {'ip': '98.76.54.32', 'port': '3128', 'https': 'yes', ...}
+ ```
+
+ You can use the returned proxy dictionary with the `requests` library, for example:
+
+ ```python
+ import requests
+
+ proxy = gp.get_proxy()
+ if proxy:
+     proxies = {
+         'http': f"http://{proxy['ip']}:{proxy['port']}",
+         'https': f"http://{proxy['ip']}:{proxy['port']}"
+     }
+     response = requests.get('https://httpbin.org/ip', proxies=proxies)
+     print(response.json())
+ else:
+     print("No working proxy found.")
+ ```
+
+ - `get_proxy()` will return the first working proxy found.
+ - `get_proxy_random()` will return a random working proxy (with up to 20 retries).
+
+ Both methods return a dictionary with proxy details (e.g., `{ 'ip': '...', 'port': '...', ... }`) or `None` if no working proxy is found.
+
+ ## Usage: Dynamic Media Web Scraping
+
+ The `DynamicScrapingNews` class provides comprehensive web scraping capabilities with built-in proxy support for extracting content, metadata, and media information from web pages.
+
+ ### Basic Usage
+
+ ```python
+ from pocong.media_spiders import DynamicScrapingNews
+
+ # Simple scraping without proxy
+ scraper = DynamicScrapingNews("https://example.com", use_proxy=False)
+ result = scraper.scrape()
+
+ # Extract specific information
+ print(f"Title: {result['title']}")
+ print(f"URL: {result['url']}")
+ print(f"Media: {result['media']}")
+ print(f"Published: {result['published_date']}")
+ print(f"Text content: {result['text'][:200]}...")  # First 200 chars
+ ```
+
+ ### Proxy Configuration Options
+
+ #### 1. Automatic Proxy (Default)
+ ```python
+ # Uses automatic proxy fetching
+ scraper = DynamicScrapingNews("https://example.com")
+ result = scraper.scrape()
+ ```
+
+ #### 2. Manual Proxy Configuration
+ ```python
+ # Method 1: IP:Port format
+ scraper = DynamicScrapingNews("https://example.com",
+                               manual_proxy="192.168.1.1:8080")
+
+ # Method 2: Full URL format
+ scraper = DynamicScrapingNews("https://example.com",
+                               manual_proxy="http://192.168.1.1:8080")
+
+ # Method 3: Dictionary format
+ scraper = DynamicScrapingNews("https://example.com",
+                               manual_proxy={"ip": "192.168.1.1", "port": "8080"})
+
+ result = scraper.scrape()
+ ```
+
+ #### 3. No Proxy
+ ```python
+ # Disable proxy completely
+ scraper = DynamicScrapingNews("https://example.com", use_proxy=False)
+ result = scraper.scrape()
+ ```
+
+ #### 4. Manual Proxy Override
+ ```python
+ # Manual proxy overrides use_proxy setting
+ scraper = DynamicScrapingNews("https://example.com",
+                               use_proxy=False,
+                               manual_proxy="192.168.1.1:8080")
+ result = scraper.scrape()
+ ```
+
+ ### Complete Example with Proxy Integration
+
+ ```python
+ from pocong.proxy_spiders import GetProxy
+ from pocong.media_spiders import DynamicScrapingNews
+
+ # Get a working proxy
+ proxy = GetProxy().get_proxy()
+ print(f"Using proxy: {proxy}")
+
+ # Use automatic proxy (default behavior)
+ scraper = DynamicScrapingNews("https://example.com")
+ result = scraper.scrape()
+
+ # Use manual proxy with ip:port format
+ scraper = DynamicScrapingNews("https://example.com",
+                               manual_proxy=f"{proxy['ip']}:{proxy['port']}")
+ result = scraper.scrape()
+
+ # Use manual proxy with dictionary format
+ scraper = DynamicScrapingNews("https://example.com",
+                               manual_proxy={"ip": proxy['ip'], "port": proxy['port']})
+ result = scraper.scrape()
+ ```
+
+ ### Extracted Data Structure
+
+ The `scrape()` method returns a dictionary containing:
+
+ ```python
+ {
+     'title': 'Page Title',            # Extracted from og:title or title tag
+     'url': 'https://example.com',     # Canonical URL
+     'image': 'https://...',           # Featured image URL
+     'html': '<html>...</html>',       # Full HTML content
+     'text': 'Clean text content',     # Processed text without HTML
+     'media': 'example',               # Domain name extracted from URL
+     'published_date': datetime(...)   # Publication date if found
+ }
+ ```
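The README above only covers the happy path. As an editorial illustration (not part of the published README), here is a minimal sketch of consuming the `scrape()` result defensively. It relies only on the keys documented above and on the fallback values visible in the `media_spiders` source further down: `None` for a missing `published_date` or `media`, and a literal placeholder string when no Open Graph tags are found. The target URL is hypothetical.

```python
from pocong.media_spiders import DynamicScrapingNews

# Hypothetical article URL, for illustration only.
result = DynamicScrapingNews("https://example.com/some-article", use_proxy=False).scrape()

# Fallback value used by _get_metadata when an og: tag is absent.
placeholder = "Pattern not found in the HTML content."

title = result["title"] if result["title"] != placeholder else None
published = result["published_date"]  # datetime or None

print("Title:", title or "<no og:title found>")
print("Source:", result["media"] or "<unknown>")
print("Published:", published.isoformat() if published else "<no date found>")
print("Text preview:", result["text"][:120])
```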
{pocong-1.0.0 → pocong-1.1.0}/setup.cfg CHANGED
@@ -46,7 +46,14 @@ package_dir =
  packages = find:
  python_requires = >=3.8
  install_requires =
+     Scrapy>=2.5.0
+     pandas>=1.3.0
+     requests>=2.25.0
      Click>=7.0
+     mechanize>=0.4.0
+     html2text>=2020.1.16
+     fake-useragent>=1.1.0
+     beautifulsoup4>=4.9.0

  [options.extras_require]
  dev =
{pocong-1.0.0 → pocong-1.1.0}/setup.py CHANGED
@@ -17,6 +17,10 @@ install_requires = [
      "pandas>=1.3.0",
      "requests>=2.25.0",
      "Click>=7.0",
+     "mechanize>=0.4.0",
+     "html2text>=2020.1.16",
+     "fake-useragent>=1.1.0",
+     "beautifulsoup4>=4.9.0",
  ]

  extras_require = {
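Both hunks above declare the same four new runtime dependencies (mechanize, html2text, fake-useragent, beautifulsoup4) alongside the existing ones. A quick, hedged way to confirm what an installed pocong 1.1.0 actually declares is to read its metadata with the standard-library `importlib.metadata`, available on Python 3.8+, the minimum version required here; this assumes the package is already installed in the current environment.

```python
from importlib.metadata import requires, version

# Assumes pocong 1.1.0 is installed in the current environment.
print(version("pocong"))  # expected: 1.1.0

# requires() returns the declared Requires-Dist entries, including the
# '; extra == "dev"' markers for the optional development dependencies.
for req in requires("pocong") or []:
    print(req)
```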
{pocong-1.0.0 → pocong-1.1.0}/src/pocong/_version.py CHANGED
@@ -8,11 +8,11 @@ import json

  version_json = '''
  {
-  "date": "2025-09-21T17:59:23+0700",
+  "date": "2025-10-11T12:21:52+0700",
   "dirty": false,
   "error": null,
-  "full-revisionid": "9a2d0f58b89a546044fc52948ce274767aa450d4",
-  "version": "1.0.0"
+  "full-revisionid": "4578ac0583c995b27ae2ec3e0297768410ee0bbc",
+  "version": "1.1.0"
  }
  ''' # END VERSION_JSON

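`_version.py` is generated by Versioneer (note `versioneer.py` in the file list), so the bumped version string and revision above are what the package reports at runtime. A small sketch, assuming the file keeps Versioneer's usual `get_versions()` helper:

```python
from pocong._version import get_versions

# Values below mirror the version_json shown in the hunk above.
info = get_versions()
print(info["version"])          # "1.1.0"
print(info["full-revisionid"])  # "4578ac0583c995b27ae2ec3e0297768410ee0bbc"
print(info["date"])             # "2025-10-11T12:21:52+0700"
```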
pocong-1.1.0/src/pocong/media_spiders/__init__.py ADDED
@@ -0,0 +1,231 @@
+ import datetime as dt
+ import re
+
+ import mechanize
+ import requests
+ from bs4 import BeautifulSoup
+ from fake_useragent import UserAgent
+ from html2text import html2text
+
+ try:
+     from pocong.proxy_spiders import GetProxy
+     PROXY_AVAILABLE = True
+ except ImportError:
+     PROXY_AVAILABLE = False
+
+
+ class DynamicScrapingNews():
+     def __init__(self, url, use_proxy=True, manual_proxy=None):
+         self.url = url
+         self.use_proxy = use_proxy
+         self.proxy = None
+
+         # Use manual proxy if provided
+         if manual_proxy:
+             self.proxy = self._format_proxy(manual_proxy)
+         # Otherwise, initialize proxy if available and requested
+         elif self.use_proxy and PROXY_AVAILABLE:
+             try:
+                 proxy_getter = GetProxy()
+                 proxy_data = proxy_getter.get_proxy_random()
+                 if proxy_data:
+                     self.proxy = f"http://{proxy_data['ip']}:{proxy_data['port']}"
+             except Exception:
+                 # If proxy initialization fails, continue without proxy
+                 self.proxy = None
+
+     def _format_proxy(self, proxy):
+         """
+         Format proxy to ensure it has the correct format.
+         Accepts formats like:
+         - "ip:port"
+         - "http://ip:port"
+         - "https://ip:port"
+         - {"ip": "x.x.x.x", "port": "xxxx"}
+         """
+         if isinstance(proxy, dict):
+             # If proxy is a dict with ip and port
+             if 'ip' in proxy and 'port' in proxy:
+                 return f"http://{proxy['ip']}:{proxy['port']}"
+             else:
+                 raise ValueError("Manual proxy dict must contain 'ip' and 'port' keys")
+         elif isinstance(proxy, str):
+             # If proxy is a string
+             if proxy.startswith(('http://', 'https://')):
+                 return proxy
+             else:
+                 # Assume it's in ip:port format
+                 return f"http://{proxy}"
+         else:
+             raise ValueError("Manual proxy must be a string or dict")
+
+     def _remove_html_tags(self, text):
+         # This regular expression will match any HTML tag and capture its contents.
+         html_tags_pattern = r'<.*?>'
+         # Use re.sub to replace all matches with an empty string.
+         clean_text = re.sub(html_tags_pattern, '', text)
+         return clean_text
+
+     def _get_metadata(self, html, list_metadata=['title', 'url', 'image']):
+         result = dict()
+         for metadata in list_metadata:
+             # Define the regular expression pattern
+             pattern = r'property="og:{}" content="([^"]+)"'.format(metadata)
+
+             # Search for the pattern in the HTML content
+             match = re.search(pattern, html)
+
+             if match:
+                 # Extract the content from the matched group
+                 og_content = match.group(1)
+                 result[metadata] = og_content if '?' not in og_content else og_content.split('?')[0]
+             else:
+                 if metadata == 'url':
+                     result[metadata] = self.url if '?' not in self.url else self.url.split('?')[0]
+                 else:
+                     result[metadata] = "Pattern not found in the HTML content."
+             result[metadata] = self._remove_html_tags(BeautifulSoup(result[metadata], 'html.parser').get_text())
+             result[metadata] = re.sub(r"&amp;", "&", result[metadata])
+         return result
+
+     def _clean_html_to_text(self, html):
+         # First we remove inline JavaScript/CSS:
+         cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
+         # Then we remove html comments. This has to be done before removing regular
+         # tags since comments can contain '>' characters.
+         cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
+         # Next we can remove the remaining tags:
+         cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
+         # Finally, we deal with whitespace
+         cleaned = re.sub(r"&nbsp;", " ", cleaned)
+         cleaned = re.sub(r" ", " ", cleaned)
+         cleaned = re.sub(r" ", " ", cleaned)
+         text = html2text(cleaned).format('utf-8')
+         spe_char = [
+             '\\u0621', '\\u0622', '\\u0625', '\\u0627', '\\u0629', '\\u062a', '\\u062b', '\\u062c', '\\u062f',
+             '\\u0631', '\\u0632', '\\u0633', '\\u0634', '\\u0636', '\\u0637', '\\u0639', '\\u063a', '\\u0641',
+             '\\u0643', '\\u0644', '\\u0645', '\\u0646', '\\u0647', '\\u0648', '\\u064a'
+         ]
+         for char in spe_char:
+             text = text.replace(char, '')
+         return text.strip()
+
+     def _get_media(self, url):
+         # Define a regular expression pattern to match the main domain (excluding "sport" and subdomains)
+         pattern = r"https?://(?:www\.)?(?:[^./]+\.)*([^.]+\.\w+)"
+
+         # Use re.search to find the first match
+         match = re.search(pattern, url.replace('.co.', '.'))
+
+         # Extract the matched domain
+         if match:
+             domain = match.group(1)
+             return domain.split('.')[0]
+         else:
+             return None
+
+     def _get_pubdate(self, html):
+         # Define a regular expression pattern to match the content attribute value
+         pattern = r'content="(\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2})"'
+
+         # Use re.search to find the first match
+         match = re.search(pattern, html)
+
+         # Extract the matched content attribute value
+         if match:
+             content_value = match.group(1)
+
+             # Convert the content value to a datetime format
+             datetime_format = "%Y/%m/%d %H:%M:%S"
+             parsed_datetime = dt.datetime.strptime(content_value, datetime_format)
+
+             return parsed_datetime
+         else:
+             # Define a regular expression pattern to match the content attribute value
+             pattern = r'content="(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})"'
+
+             # Use re.search to find the first match
+             match = re.search(pattern, html)
+
+             # Extract the matched content attribute value
+             if match:
+                 content_value = match.group(1)
+
+                 # Convert the content value to a datetime format
+                 datetime_format = "%Y-%m-%d %H:%M:%S"
+                 parsed_datetime = dt.datetime.strptime(content_value, datetime_format)
+
+                 return parsed_datetime
+             else:
+                 # Define a regular expression pattern to match the content attribute value
+                 pattern = r'content="(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+\d{2}:\d{2})"'
+
+                 # Use re.search to find the first match
+                 match = re.search(pattern, html)
+
+                 # Extract the matched content attribute value
+                 if match:
+                     content_value = match.group(1)
+
+                     # Convert the content value to a datetime format
+                     datetime_format = "%Y-%m-%dT%H:%M:%S%z"
+                     parsed_datetime = dt.datetime.strptime(content_value, datetime_format)
+
+                     return parsed_datetime
+                 else:
+                     return None
+
+     def _get_html(self, url):
+         # random useragent
+         ua = UserAgent()
+         user_agent = ua.random
+         headers = {'User-Agent': user_agent}
+
+         # Try with mechanize first (with proxy if available)
+         try:
+             br = mechanize.Browser()
+             br.set_handle_robots(False)
+             br.addheaders = [('User-Agent', user_agent)]
+
+             # Set proxy for mechanize if available
+             if self.proxy:
+                 br.set_proxies({'http': self.proxy, 'https': self.proxy})
+
+             html = br.open(url).read().decode('utf-8')
+             return html
+         except Exception:
+             # Fallback to requests (with proxy if available)
+             try:
+                 proxies = {'http': self.proxy, 'https': self.proxy} if self.proxy else None
+                 response = requests.get(url, headers=headers, proxies=proxies, timeout=30)
+                 html = response.content.decode('utf-8')
+                 return html
+             except Exception:
+                 # Final fallback without proxy
+                 response = requests.get(url, headers=headers, timeout=30)
+                 html = response.content.decode('utf-8')
+                 return html
+
+     def scrape(self):
+         # get html from url
+         html = self._get_html(self.url)
+
+         # get metadata
+         metadata = self._get_metadata(html)
+
+         # convert html to text
+         text = self._clean_html_to_text(html)
+
+         # get media from url
+         media = self._get_media(self.url)
+
+         # get published_date from html
+         published_date = self._get_pubdate(html)
+
+         # combine result
+         metadata['html'] = html
+         metadata['text'] = text
+         metadata['media'] = media
+         metadata['published_date'] = published_date
+
+         return metadata
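To make the proxy handling in `__init__` and `_format_proxy` concrete, here is a short, hedged sketch of the normalization applied to the three accepted proxy formats. `_format_proxy` is an internal helper, so calling it directly is purely illustrative; the addresses are placeholders.

```python
from pocong.media_spiders import DynamicScrapingNews

# use_proxy=False keeps the constructor from fetching a proxy over the network.
scraper = DynamicScrapingNews("https://example.com", use_proxy=False)

print(scraper._format_proxy("192.168.1.1:8080"))                     # http://192.168.1.1:8080
print(scraper._format_proxy("https://192.168.1.1:8080"))             # returned unchanged
print(scraper._format_proxy({"ip": "192.168.1.1", "port": "8080"}))  # http://192.168.1.1:8080
```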