pocong-0.1.3-py3-none-any.whl → pocong-1.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pocong/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2025-08-23T21:57:15+0700",
+ "date": "2025-09-21T22:32:32+0700",
  "dirty": false,
  "error": null,
- "full-revisionid": "49273c80614427be28b9ed5ed8db6214c15148d4",
- "version": "0.1.3"
+ "full-revisionid": "85eac04a9a369e96c0c4819feecca0cc5d23c173",
+ "version": "1.0.1"
 }
 ''' # END VERSION_JSON
 
pocong/proxy_spiders/__init__.py ADDED
@@ -0,0 +1,72 @@
+# Init file for spiders module to make it a package
+import random
+
+import requests
+
+
+class GetProxy():
+    '''
+    Class to get proxies using Scrapy spiders and validate them.
+    '''
+    def __init__(self):
+        self._items = []
+
+    def _check_proxy(self, x):
+        proxy = f"http://{x['ip']}:{x['port']}"
+        try:
+            response = requests.get("https://httpbin.org/ip", proxies={'https': proxy}, timeout=10)
+            if response.status_code == 200 and response.json().get('origin') == x['ip']:
+                return response.status_code
+            return 0
+        except requests.RequestException:
+            return 0
+
+    def _get_proxy_from_scrape(self):
+        import subprocess
+        import sys
+        import json
+        # Run the spider in a subprocess to avoid reactor restart error
+        code = (
+            'import pandas as pd;'
+            'from scrapy.crawler import CrawlerProcess;'
+            'from pocong.proxy_spiders.spiders.free_proxy_list_net_spider import ProxySpider;'
+            'from pocong.proxy_spiders.pipelines import collected_items;'
+            'process = CrawlerProcess(settings={"LOG_LEVEL": "ERROR", "ITEM_PIPELINES": {"pocong.proxy_spiders.pipelines.Pipelines": 1}});'  # noqa: E501
+            'process.crawl(ProxySpider);'
+            'process.start();'
+            'process.stop();'
+            'df = pd.DataFrame(collected_items);'
+            'df = df[df["https"] == "yes"];'
+            'df = df.drop_duplicates(subset=["ip", "port"]);'
+            'print(df.to_json(orient="records"))'
+        )
+        result = subprocess.run([sys.executable, '-c', code], capture_output=True, text=True)
+        proxies_json = json.loads(result.stdout.strip()) if result.stdout.strip() else []
+        return proxies_json
+
+    def get_proxy(self):
+        '''
+        Get a working proxy from the list of proxies.
+        parameter: None
+        return: dict or None
+        '''
+        proxies_json = self._get_proxy_from_scrape()
+        for proxy in proxies_json:
+            if self._check_proxy(proxy) == 200:
+                return proxy
+
+    def get_proxy_random(self):
+        '''
+        Get a random working proxy from the list of proxies.
+        parameter: None
+        return: dict or None
+        '''
+        proxies_json = self._get_proxy_from_scrape()
+        retry = 0
+        proxy = None
+        while retry < 20:
+            retry += 1
+            proxy = random.choice(proxies_json)
+            if self._check_proxy(proxy) == 200:
+                break
+        return proxy
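A note on the subprocess indirection above: Scrapy's Twisted reactor cannot be restarted once stopped, so calling `process.start()` a second time in the same interpreter raises `ReactorNotRestartable`; running each crawl in a child process sidesteps this. A minimal, stdlib-only sketch of the same isolation pattern (the payload here is a stand-in, not the real spider):

```python
import json
import subprocess
import sys

# A one-shot job that can only run once per interpreter (like Scrapy's
# Twisted reactor) can be isolated in a child process that reports its
# result as JSON on stdout -- the same pattern _get_proxy_from_scrape uses.
code = 'import json; print(json.dumps([{"ip": "127.0.0.1", "port": "8080"}]))'

result = subprocess.run([sys.executable, '-c', code], capture_output=True, text=True)
items = json.loads(result.stdout.strip()) if result.stdout.strip() else []
print(items)  # each call spawns a fresh interpreter, so repeat runs are safe
```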
pocong/proxy_spiders/pipelines.py ADDED
@@ -0,0 +1,7 @@
+collected_items = []
+
+
+class Pipelines:
+    def process_item(self, item, spider):
+        collected_items.append(dict(item))
+        return item
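The pipeline simply accumulates every scraped item into the module-level `collected_items` list, which the crawl subprocess imports and serializes once the spider finishes. A quick sketch of the contract, calling `process_item` directly (assuming pocong 1.0.1 is installed; `spider` is unused by this implementation):

```python
from pocong.proxy_spiders.pipelines import Pipelines, collected_items

# Scrapy invokes process_item once per item the spider yields; the item is
# copied into collected_items and passed through unchanged.
pipe = Pipelines()
pipe.process_item({'ip': '127.0.0.1', 'port': '8080'}, spider=None)
print(collected_items)  # [{'ip': '127.0.0.1', 'port': '8080'}]
```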
pocong/proxy_spiders/spiders/__init__.py ADDED
@@ -0,0 +1 @@
+# Init file for spiders module to make it a package
pocong/proxy_spiders/spiders/free_proxy_list_net_spider.py ADDED
@@ -0,0 +1,25 @@
+import scrapy
+
+
+class ProxySpider(scrapy.Spider):
+    name = "example"
+    start_urls = [
+        'https://free-proxy-list.net/en/',
+    ]
+
+    def parse(self, response):
+        # Extract proxy table rows
+        rows = response.css('table tbody tr')
+        for row in rows:
+            columns = row.css('td')
+            if len(columns) >= 8:
+                yield {
+                    'ip': columns[0].css('::text').get(),
+                    'port': columns[1].css('::text').get(),
+                    'code': columns[2].css('::text').get(),
+                    'country': columns[3].css('::text').get(),
+                    'anonymity': columns[4].css('::text').get(),
+                    'google': columns[5].css('::text').get(),
+                    'https': columns[6].css('::text').get(),
+                    'last_checked': columns[7].css('::text').get(),
+                }
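For reference, the spider can be exercised on its own with `CrawlerProcess`, mirroring the snippet that `_get_proxy_from_scrape` builds; a sketch assuming pocong 1.0.1 and Scrapy are installed:

```python
from scrapy.crawler import CrawlerProcess

from pocong.proxy_spiders.pipelines import collected_items
from pocong.proxy_spiders.spiders.free_proxy_list_net_spider import ProxySpider

# Register the collecting pipeline so yielded items land in collected_items.
process = CrawlerProcess(settings={
    'LOG_LEVEL': 'ERROR',
    'ITEM_PIPELINES': {'pocong.proxy_spiders.pipelines.Pipelines': 1},
})
process.crawl(ProxySpider)
process.start()  # blocks until the crawl finishes; cannot be restarted

print(collected_items[:3])  # first few scraped proxy dicts
```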
{pocong-0.1.3.dist-info → pocong-1.0.1.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pocong
-Version: 0.1.3
+Version: 1.0.1
 Summary: Python Oriented Crawling Ongoing (POCONG): a simple crawling framework
 Home-page: https://gitlab.com/mohsin3107/pocong
 Author: Singgih
@@ -19,6 +19,9 @@ Classifier: Topic :: Software Development :: Libraries
 Classifier: Topic :: Internet :: WWW/HTTP
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
+Requires-Dist: Scrapy>=2.5.0
+Requires-Dist: pandas>=1.3.0
+Requires-Dist: requests>=2.25.0
 Requires-Dist: Click>=7.0
 Provides-Extra: dev
 Requires-Dist: pytest; extra == "dev"
@@ -57,3 +60,55 @@ POCONG is a lightweight web crawling framework built in Python.
 ## Installation
 ```bash
 pip install pocong
+```
+
+## Usage: Get Proxy from proxy_spiders
+
+You can use the `get_proxy` and `get_proxy_random` methods from `proxy_spiders` to fetch working proxies.
+
+```python
+from pocong.proxy_spiders import GetProxy
+
+gp = GetProxy()
+
+# Get the first working proxy
+proxy = gp.get_proxy()
+print("First working proxy:", proxy)
+```
+```python
+from pocong.proxy_spiders import GetProxy
+
+gp = GetProxy()
+
+# Get a random working proxy
+random_proxy = gp.get_proxy_random()
+print("Random working proxy:", random_proxy)
+```
+
+Sample output:
+```
+First working proxy: {'ip': '123.45.67.89', 'port': '8080', 'https': 'yes', ...}
+Random working proxy: {'ip': '98.76.54.32', 'port': '3128', 'https': 'yes', ...}
+```
+
+You can use the returned proxy dictionary with the `requests` library, for example:
+
+```python
+import requests
+
+proxy = gp.get_proxy()
+if proxy:
+    proxies = {
+        'http': f"http://{proxy['ip']}:{proxy['port']}",
+        'https': f"http://{proxy['ip']}:{proxy['port']}"
+    }
+    response = requests.get('https://httpbin.org/ip', proxies=proxies)
+    print(response.json())
+else:
+    print("No working proxy found.")
+```
+
+- `get_proxy()` will return the first working proxy found.
+- `get_proxy_random()` will return a random working proxy (with up to 20 retries).
+
+Both methods return a dictionary with proxy details (e.g., `{ 'ip': '...', 'port': '...', ... }`) or `None` if no working proxy is found.
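One caveat the README leaves implicit: free proxies are short-lived, so a proxy that validated inside `get_proxy()` can fail moments later. A hedged sketch of a retry wrapper built on the documented API (`fetch_via_proxy` is illustrative, not part of pocong):

```python
import requests

from pocong.proxy_spiders import GetProxy

def fetch_via_proxy(url, attempts=3, timeout=10):
    """Try up to `attempts` fresh proxies from GetProxy before giving up."""
    gp = GetProxy()
    for _ in range(attempts):
        proxy = gp.get_proxy_random()  # re-scrapes, so each attempt is slow
        if proxy is None:
            break
        proxies = {
            'http': f"http://{proxy['ip']}:{proxy['port']}",
            'https': f"http://{proxy['ip']}:{proxy['port']}",
        }
        try:
            return requests.get(url, proxies=proxies, timeout=timeout)
        except requests.RequestException:
            continue  # proxy died between validation and use; try another
    return None

response = fetch_via_proxy('https://httpbin.org/ip')
print(response.json() if response is not None else 'No working proxy found.')
```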
pocong-1.0.1.dist-info/RECORD ADDED
@@ -0,0 +1,14 @@
+pocong/__init__.py,sha256=Hl0PkSkg6LV6IRLzXnGc0K2GY-drxkZEpt5qTAVDUkY,109
+pocong/_version.py,sha256=t_ZIUHG9ovHL3x_o5mb9F_Ih_UhgDKE9j9yI8c92xyk,497
+pocong/cli.py,sha256=_f_aU4pckbQ_baF9oHwbqwmBFiQFn5Irvi-v5rDZ70o,529
+pocong/pocong.py,sha256=h0hwdogXGFqerm-5ZPeT-irPn91pCcQRjiHThXsRzEk,19
+pocong/utils.py,sha256=MAbbL9PXRWnBpJKgI869ZfY42Eph73zcbJyK0jH2Nak,35
+pocong/proxy_spiders/__init__.py,sha256=RBlvqba1wIhCJPn4n5LNB3SLiCQHDcGOhtrFydWY7T4,2556
+pocong/proxy_spiders/pipelines.py,sha256=k8DRupjvN7qnIk0uFNJ_3JEFlDadtO0PCBH0iOsPKp4,145
+pocong/proxy_spiders/spiders/__init__.py,sha256=4-oTTycftRXl_6z92SjSi_XmDfP-1xAaVj39HMggLWc,52
+pocong/proxy_spiders/spiders/free_proxy_list_net_spider.py,sha256=AV-8_KF7UMRkdcuCaqdhGtbsMawpJt9G3NF6S7aVQO4,886
+pocong-1.0.1.dist-info/METADATA,sha256=-t70ukwfphmnul9khbMmbYyIOIowevrC9eriXZrv5so,3411
+pocong-1.0.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+pocong-1.0.1.dist-info/entry_points.txt,sha256=Q3F4OQIZJzlnS2tnEuTzcn2tN4S5Btd08o_9Otdb4bM,43
+pocong-1.0.1.dist-info/top_level.txt,sha256=ZMo2AlCPGpM4N7hHVSNoIjbM1D90yjFhRra0YmCfTO4,7
+pocong-1.0.1.dist-info/RECORD,,
pocong-0.1.3.dist-info/RECORD DELETED
@@ -1,10 +0,0 @@
-pocong/__init__.py,sha256=Hl0PkSkg6LV6IRLzXnGc0K2GY-drxkZEpt5qTAVDUkY,109
-pocong/_version.py,sha256=6DzWhBmSmCgH-UnOBP9FE_v5R2nrgAZOwvaIfjiu-CE,497
-pocong/cli.py,sha256=_f_aU4pckbQ_baF9oHwbqwmBFiQFn5Irvi-v5rDZ70o,529
-pocong/pocong.py,sha256=h0hwdogXGFqerm-5ZPeT-irPn91pCcQRjiHThXsRzEk,19
-pocong/utils.py,sha256=MAbbL9PXRWnBpJKgI869ZfY42Eph73zcbJyK0jH2Nak,35
-pocong-0.1.3.dist-info/METADATA,sha256=BsjoWSXB7UYifsd7F5thMiWK3rTyfjX5n3Kfl47gG_0,1925
-pocong-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-pocong-0.1.3.dist-info/entry_points.txt,sha256=Q3F4OQIZJzlnS2tnEuTzcn2tN4S5Btd08o_9Otdb4bM,43
-pocong-0.1.3.dist-info/top_level.txt,sha256=ZMo2AlCPGpM4N7hHVSNoIjbM1D90yjFhRra0YmCfTO4,7
-pocong-0.1.3.dist-info/RECORD,,