pocong 0.1.3__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pocong/_version.py +3 -3
- pocong/proxy_spiders/__init__.py +74 -0
- pocong/proxy_spiders/pipelines.py +7 -0
- pocong/proxy_spiders/spiders/__init__.py +1 -0
- pocong/proxy_spiders/spiders/free_proxy_list_net_spider.py +25 -0
- {pocong-0.1.3.dist-info → pocong-1.0.0.dist-info}/METADATA +56 -1
- pocong-1.0.0.dist-info/RECORD +14 -0
- pocong-0.1.3.dist-info/RECORD +0 -10
- {pocong-0.1.3.dist-info → pocong-1.0.0.dist-info}/WHEEL +0 -0
- {pocong-0.1.3.dist-info → pocong-1.0.0.dist-info}/entry_points.txt +0 -0
- {pocong-0.1.3.dist-info → pocong-1.0.0.dist-info}/top_level.txt +0 -0
pocong/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json
|
|
8
8
|
|
9
9
|
version_json = '''
|
10
10
|
{
|
11
|
-
"date": "2025-
|
11
|
+
"date": "2025-09-21T17:59:23+0700",
|
12
12
|
"dirty": false,
|
13
13
|
"error": null,
|
14
|
-
"full-revisionid": "
|
15
|
-
"version": "0.
|
14
|
+
"full-revisionid": "9a2d0f58b89a546044fc52948ce274767aa450d4",
|
15
|
+
"version": "1.0.0"
|
16
16
|
}
|
17
17
|
''' # END VERSION_JSON
|
18
18
|
|
@@ -0,0 +1,74 @@
|
|
1
|
+
# Init file for spiders module to make it a package
|
2
|
+
import random
|
3
|
+
|
4
|
+
import requests
|
5
|
+
import pandas as pd
|
6
|
+
from scrapy.crawler import CrawlerProcess
|
7
|
+
|
8
|
+
from pocong.proxy_spiders.spiders.free_proxy_list_net_spider import ProxySpider
|
9
|
+
|
10
|
+
|
11
|
+
class GetProxy():
    '''
    Class to get proxies using Scrapy spiders and validate them.

    get_proxy() returns the first proxy that validates against httpbin;
    get_proxy_random() samples candidates randomly with bounded retries.
    Both return a proxy dict (ip/port/https/...) or None.
    '''
    def __init__(self):
        pass

    def _check_proxy(self, x):
        '''
        Return 200 if the proxy described by x (dict with 'ip' and 'port')
        works, else 0.  A proxy counts as working only when httpbin reports
        the request origin as the proxy's own IP — i.e. traffic really went
        through it.
        '''
        proxy = f"http://{x['ip']}:{x['port']}"
        try:
            response = requests.get("https://httpbin.org/ip", proxies={'https': proxy}, timeout=10)
            if response.status_code == 200 and response.json().get('origin') == x['ip']:
                print(f"checking proxy: {proxy} success") # noqa
                return response.status_code
            print(f"checking proxy: {proxy} failed") # noqa
            return 0
        except requests.RequestException:
            print(f"checking proxy: {proxy} failed") # noqa
            return 0

    def _run_example_spider(self):
        '''
        Run ProxySpider in a blocking CrawlerProcess and return the items
        collected by the pipeline.
        '''
        process = CrawlerProcess(settings={
            "LOG_LEVEL": "ERROR",
            "ITEM_PIPELINES": {'pocong.proxy_spiders.pipelines.Pipelines': 1},
        })
        process.crawl(ProxySpider)
        process.start()
        # Imported only after the crawl so the module-level list is populated.
        from pocong.proxy_spiders.pipelines import collected_items
        return collected_items

    def _get_proxy_from_scrape(self):
        '''
        Scrape proxies and return unique HTTPS-capable candidates as a list
        of dicts (possibly empty).
        '''
        items = self._run_example_spider()
        if not items:
            # Guard: pd.DataFrame([])['https'] would raise KeyError.
            return []
        df = pd.DataFrame(items)
        df = df[df['https'] == 'yes']
        df = df.drop_duplicates(subset=['ip', 'port'])
        return df.to_dict(orient='records')

    def get_proxy(self):
        '''
        Get a working proxy from the list of proxies.
        parameter: None
        return: dict or None
        '''
        proxies_json = self._get_proxy_from_scrape()
        for proxy in proxies_json:
            if self._check_proxy(proxy) == 200:
                return proxy
        return None  # explicit: no candidate validated

    def get_proxy_random(self):
        '''
        Get a random working proxy from the list of proxies.
        parameter: None
        return: dict or None
        '''
        proxies_json = self._get_proxy_from_scrape()
        if not proxies_json:
            # Guard: random.choice on an empty list raises IndexError.
            return None
        for _ in range(20):
            proxy = random.choice(proxies_json)
            if self._check_proxy(proxy) == 200:
                return proxy
        # Bug fix: previously the last *failed* candidate was returned after
        # exhausting the retries; the documented contract is dict-or-None.
        return None
|
@@ -0,0 +1 @@
|
|
1
|
+
# Init file for spiders module to make it a package
|
@@ -0,0 +1,25 @@
|
|
1
|
+
import scrapy
|
2
|
+
|
3
|
+
|
4
|
+
class ProxySpider(scrapy.Spider):
    '''Spider that scrapes the proxy table from free-proxy-list.net.'''

    name = "example"
    start_urls = [
        'https://free-proxy-list.net/en/',
    ]

    # Column order of the proxy table on the page.
    _FIELDS = ('ip', 'port', 'code', 'country', 'anonymity',
               'google', 'https', 'last_checked')

    def parse(self, response):
        '''Yield one dict per table row, keyed by _FIELDS.'''
        for row in response.css('table tbody tr'):
            cells = row.css('td')
            if len(cells) < 8:
                continue  # skip malformed or incomplete rows
            yield {
                field: cell.css('::text').get()
                for field, cell in zip(self._FIELDS, cells)
            }
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: pocong
|
3
|
-
Version: 0.1.3
|
3
|
+
Version: 1.0.0
|
4
4
|
Summary: Python Oriented Crawling Ongoing (POCONG): a simple crawling framework
|
5
5
|
Home-page: https://gitlab.com/mohsin3107/pocong
|
6
6
|
Author: Singgih
|
@@ -19,6 +19,9 @@ Classifier: Topic :: Software Development :: Libraries
|
|
19
19
|
Classifier: Topic :: Internet :: WWW/HTTP
|
20
20
|
Requires-Python: >=3.8
|
21
21
|
Description-Content-Type: text/markdown
|
22
|
+
Requires-Dist: Scrapy>=2.5.0
|
23
|
+
Requires-Dist: pandas>=1.3.0
|
24
|
+
Requires-Dist: requests>=2.25.0
|
22
25
|
Requires-Dist: Click>=7.0
|
23
26
|
Provides-Extra: dev
|
24
27
|
Requires-Dist: pytest; extra == "dev"
|
@@ -57,3 +60,55 @@ POCONG is a lightweight web crawling framework built in Python.
|
|
57
60
|
## Installation
|
58
61
|
```bash
|
59
62
|
pip install pocong
|
63
|
+
```
|
64
|
+
|
65
|
+
## Usage: Get Proxy from proxy_spiders
|
66
|
+
|
67
|
+
You can use the `get_proxy` and `get_proxy_random` methods from `proxy_spiders` to fetch working proxies.
|
68
|
+
|
69
|
+
```python
|
70
|
+
from pocong.proxy_spiders import GetProxy
|
71
|
+
|
72
|
+
gp = GetProxy()
|
73
|
+
|
74
|
+
# Get the first working proxy
|
75
|
+
proxy = gp.get_proxy()
|
76
|
+
print("First working proxy:", proxy)
|
77
|
+
```
|
78
|
+
```python
|
79
|
+
from pocong.proxy_spiders import GetProxy
|
80
|
+
|
81
|
+
gp = GetProxy()
|
82
|
+
|
83
|
+
# Get a random working proxy
|
84
|
+
random_proxy = gp.get_proxy_random()
|
85
|
+
print("Random working proxy:", random_proxy)
|
86
|
+
```
|
87
|
+
|
88
|
+
Sample output:
|
89
|
+
```
|
90
|
+
First working proxy: {'ip': '123.45.67.89', 'port': '8080', 'https': 'yes', ...}
|
91
|
+
Random working proxy: {'ip': '98.76.54.32', 'port': '3128', 'https': 'yes', ...}
|
92
|
+
```
|
93
|
+
|
94
|
+
You can use the returned proxy dictionary with the `requests` library, for example:
|
95
|
+
|
96
|
+
```python
|
97
|
+
import requests
|
98
|
+
|
99
|
+
proxy = gp.get_proxy()
|
100
|
+
if proxy:
|
101
|
+
proxies = {
|
102
|
+
'http': f"http://{proxy['ip']}:{proxy['port']}",
|
103
|
+
'https': f"http://{proxy['ip']}:{proxy['port']}"
|
104
|
+
}
|
105
|
+
response = requests.get('https://httpbin.org/ip', proxies=proxies)
|
106
|
+
print(response.json())
|
107
|
+
else:
|
108
|
+
print("No working proxy found.")
|
109
|
+
```
|
110
|
+
|
111
|
+
- `get_proxy()` will return the first working proxy found.
|
112
|
+
- `get_proxy_random()` will return a random working proxy (with up to 20 retries).
|
113
|
+
|
114
|
+
Both methods return a dictionary with proxy details (e.g., `{ 'ip': '...', 'port': '...', ... }`) or `None` if no working proxy is found.
|
@@ -0,0 +1,14 @@
|
|
1
|
+
pocong/__init__.py,sha256=Hl0PkSkg6LV6IRLzXnGc0K2GY-drxkZEpt5qTAVDUkY,109
|
2
|
+
pocong/_version.py,sha256=hFgXEoBMIBnyj4LP_PTO6GNR5nrvefsXEARVrbSCj7o,497
|
3
|
+
pocong/cli.py,sha256=_f_aU4pckbQ_baF9oHwbqwmBFiQFn5Irvi-v5rDZ70o,529
|
4
|
+
pocong/pocong.py,sha256=h0hwdogXGFqerm-5ZPeT-irPn91pCcQRjiHThXsRzEk,19
|
5
|
+
pocong/utils.py,sha256=MAbbL9PXRWnBpJKgI869ZfY42Eph73zcbJyK0jH2Nak,35
|
6
|
+
pocong/proxy_spiders/__init__.py,sha256=q3ifQZd4_TipTmVYklCdvgVZEkuLR91Qo0LwM4CBMnA,2384
|
7
|
+
pocong/proxy_spiders/pipelines.py,sha256=k8DRupjvN7qnIk0uFNJ_3JEFlDadtO0PCBH0iOsPKp4,145
|
8
|
+
pocong/proxy_spiders/spiders/__init__.py,sha256=4-oTTycftRXl_6z92SjSi_XmDfP-1xAaVj39HMggLWc,52
|
9
|
+
pocong/proxy_spiders/spiders/free_proxy_list_net_spider.py,sha256=AV-8_KF7UMRkdcuCaqdhGtbsMawpJt9G3NF6S7aVQO4,886
|
10
|
+
pocong-1.0.0.dist-info/METADATA,sha256=2A3QXiD-vF7rgHM6sN4Yt9-O8KbRm_5DGaKGfCaKMjw,3411
|
11
|
+
pocong-1.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
12
|
+
pocong-1.0.0.dist-info/entry_points.txt,sha256=Q3F4OQIZJzlnS2tnEuTzcn2tN4S5Btd08o_9Otdb4bM,43
|
13
|
+
pocong-1.0.0.dist-info/top_level.txt,sha256=ZMo2AlCPGpM4N7hHVSNoIjbM1D90yjFhRra0YmCfTO4,7
|
14
|
+
pocong-1.0.0.dist-info/RECORD,,
|
pocong-0.1.3.dist-info/RECORD
DELETED
@@ -1,10 +0,0 @@
|
|
1
|
-
pocong/__init__.py,sha256=Hl0PkSkg6LV6IRLzXnGc0K2GY-drxkZEpt5qTAVDUkY,109
|
2
|
-
pocong/_version.py,sha256=6DzWhBmSmCgH-UnOBP9FE_v5R2nrgAZOwvaIfjiu-CE,497
|
3
|
-
pocong/cli.py,sha256=_f_aU4pckbQ_baF9oHwbqwmBFiQFn5Irvi-v5rDZ70o,529
|
4
|
-
pocong/pocong.py,sha256=h0hwdogXGFqerm-5ZPeT-irPn91pCcQRjiHThXsRzEk,19
|
5
|
-
pocong/utils.py,sha256=MAbbL9PXRWnBpJKgI869ZfY42Eph73zcbJyK0jH2Nak,35
|
6
|
-
pocong-0.1.3.dist-info/METADATA,sha256=BsjoWSXB7UYifsd7F5thMiWK3rTyfjX5n3Kfl47gG_0,1925
|
7
|
-
pocong-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
8
|
-
pocong-0.1.3.dist-info/entry_points.txt,sha256=Q3F4OQIZJzlnS2tnEuTzcn2tN4S5Btd08o_9Otdb4bM,43
|
9
|
-
pocong-0.1.3.dist-info/top_level.txt,sha256=ZMo2AlCPGpM4N7hHVSNoIjbM1D90yjFhRra0YmCfTO4,7
|
10
|
-
pocong-0.1.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|