pocong 0.1.3__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pocong/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-08-23T21:57:15+0700",
11
+ "date": "2025-09-21T17:59:23+0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "49273c80614427be28b9ed5ed8db6214c15148d4",
15
- "version": "0.1.3"
14
+ "full-revisionid": "9a2d0f58b89a546044fc52948ce274767aa450d4",
15
+ "version": "1.0.0"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -0,0 +1,74 @@
1
+ # Init file for spiders module to make it a package
2
+ import random
3
+
4
+ import requests
5
+ import pandas as pd
6
+ from scrapy.crawler import CrawlerProcess
7
+
8
+ from pocong.proxy_spiders.spiders.free_proxy_list_net_spider import ProxySpider
9
+
10
+
11
class GetProxy():
    '''
    Class to get proxies using Scrapy spiders and validate them.

    Proxies are scraped from free-proxy-list.net via ProxySpider and
    validated by routing a request to https://httpbin.org/ip through
    each candidate.
    '''
    def __init__(self):
        pass

    def _check_proxy(self, x):
        '''
        Validate a single proxy candidate.
        parameter: x -- dict with at least 'ip' and 'port' keys
        return: 200 if the proxy works, 0 otherwise
        '''
        proxy = f"http://{x['ip']}:{x['port']}"
        try:
            response = requests.get("https://httpbin.org/ip", proxies={'https': proxy}, timeout=10)
            # Only accept the proxy if httpbin saw the proxy's IP as the
            # origin, i.e. traffic really was routed through it.
            if response.status_code == 200 and response.json().get('origin') == x['ip']:
                print(f"checking proxy: {proxy} success")  # noqa
                return response.status_code
            print(f"checking proxy: {proxy} failed")  # noqa
            return 0
        except requests.RequestException:
            print(f"checking proxy: {proxy} failed")  # noqa
            return 0

    def _run_example_spider(self):
        '''
        Run ProxySpider in a blocking CrawlerProcess and return raw items.

        NOTE(review): Twisted's reactor cannot be restarted, so
        CrawlerProcess.start() can only run once per interpreter --
        confirm callers do not invoke this twice in the same process.
        '''
        process = CrawlerProcess(settings={
            "LOG_LEVEL": "ERROR",
            "ITEM_PIPELINES": {'pocong.proxy_spiders.pipelines.Pipelines': 1},
        })
        process.crawl(ProxySpider)
        process.start()  # blocks until the crawl finishes
        # Imported here, after the crawl, because the pipeline module
        # accumulates scraped items in this module-level list at crawl time.
        from pocong.proxy_spiders.pipelines import collected_items
        return collected_items

    def _get_proxy_from_scrape(self):
        '''
        Scrape proxies and keep only unique HTTPS-capable entries.
        parameter: None
        return: list of dicts (possibly empty)
        '''
        items = self._run_example_spider()
        # Guard: an empty scrape would otherwise raise KeyError on df['https'].
        if not items:
            return []
        df = pd.DataFrame(items)
        df = df[df['https'] == 'yes']
        df = df.drop_duplicates(subset=['ip', 'port'])
        return df.to_dict(orient='records')

    def get_proxy(self):
        '''
        Get the first working proxy from the list of proxies.
        parameter: None
        return: dict or None
        '''
        for proxy in self._get_proxy_from_scrape():
            if self._check_proxy(proxy) == 200:
                return proxy
        return None

    def get_proxy_random(self):
        '''
        Get a random working proxy from the list of proxies.
        Tries up to 20 random candidates.
        parameter: None
        return: dict or None
        '''
        proxies_json = self._get_proxy_from_scrape()
        # Guard: random.choice on an empty list raises IndexError.
        if not proxies_json:
            return None
        for _ in range(20):
            candidate = random.choice(proxies_json)
            if self._check_proxy(candidate) == 200:
                return candidate
        # Bug fix: the original returned the last *failed* candidate after
        # exhausting retries; honour the documented contract instead.
        return None
@@ -0,0 +1,7 @@
1
# Module-level sink for items gathered during a crawl; GetProxy reads it
# back after the spider process finishes.
collected_items = []


class Pipelines:
    """Scrapy item pipeline that mirrors every scraped item into
    the module-level ``collected_items`` list."""

    def process_item(self, item, spider):
        """Record a plain-dict copy of *item* and pass the item through unchanged."""
        snapshot = dict(item)
        collected_items.append(snapshot)
        return item
@@ -0,0 +1 @@
1
+ # Init file for spiders module to make it a package
@@ -0,0 +1,25 @@
1
+ import scrapy
2
+
3
+
4
class ProxySpider(scrapy.Spider):
    """Spider that scrapes the proxy listing table from free-proxy-list.net."""

    name = "example"
    start_urls = [
        'https://free-proxy-list.net/en/',
    ]

    def parse(self, response):
        """Yield one dict per proxy row found in the listing table.

        Rows with fewer than 8 cells are skipped; cell order follows the
        site's column layout.
        """
        field_names = ('ip', 'port', 'code', 'country',
                       'anonymity', 'google', 'https', 'last_checked')
        for row in response.css('table tbody tr'):
            cells = row.css('td')
            if len(cells) < 8:
                continue
            yield {key: cells[idx].css('::text').get()
                   for idx, key in enumerate(field_names)}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pocong
3
- Version: 0.1.3
3
+ Version: 1.0.0
4
4
  Summary: Python Oriented Crawling Ongoing (POCONG): a simple crawling framework
5
5
  Home-page: https://gitlab.com/mohsin3107/pocong
6
6
  Author: Singgih
@@ -19,6 +19,9 @@ Classifier: Topic :: Software Development :: Libraries
19
19
  Classifier: Topic :: Internet :: WWW/HTTP
20
20
  Requires-Python: >=3.8
21
21
  Description-Content-Type: text/markdown
22
+ Requires-Dist: Scrapy>=2.5.0
23
+ Requires-Dist: pandas>=1.3.0
24
+ Requires-Dist: requests>=2.25.0
22
25
  Requires-Dist: Click>=7.0
23
26
  Provides-Extra: dev
24
27
  Requires-Dist: pytest; extra == "dev"
@@ -57,3 +60,55 @@ POCONG is a lightweight web crawling framework built in Python.
57
60
  ## Installation
58
61
  ```bash
59
62
  pip install pocong
63
+ ```
64
+
65
+ ## Usage: Get Proxy from proxy_spiders
66
+
67
+ You can use the `get_proxy` and `get_proxy_random` methods from `proxy_spiders` to fetch working proxies.
68
+
69
+ ```python
70
+ from pocong.proxy_spiders import GetProxy
71
+
72
+ gp = GetProxy()
73
+
74
+ # Get the first working proxy
75
+ proxy = gp.get_proxy()
76
+ print("First working proxy:", proxy)
77
+ ```
78
+ ```python
79
+ from pocong.proxy_spiders import GetProxy
80
+
81
+ gp = GetProxy()
82
+
83
+ # Get a random working proxy
84
+ random_proxy = gp.get_proxy_random()
85
+ print("Random working proxy:", random_proxy)
86
+ ```
87
+
88
+ Sample output:
89
+ ```
90
+ First working proxy: {'ip': '123.45.67.89', 'port': '8080', 'https': 'yes', ...}
91
+ Random working proxy: {'ip': '98.76.54.32', 'port': '3128', 'https': 'yes', ...}
92
+ ```
93
+
94
+ You can use the returned proxy dictionary with the `requests` library, for example:
95
+
96
+ ```python
97
+ import requests
98
+
99
+ proxy = gp.get_proxy()
100
+ if proxy:
101
+ proxies = {
102
+ 'http': f"http://{proxy['ip']}:{proxy['port']}",
103
+ 'https': f"http://{proxy['ip']}:{proxy['port']}"
104
+ }
105
+ response = requests.get('https://httpbin.org/ip', proxies=proxies)
106
+ print(response.json())
107
+ else:
108
+ print("No working proxy found.")
109
+ ```
110
+
111
+ - `get_proxy()` will return the first working proxy found.
112
+ - `get_proxy_random()` will return a random working proxy (with up to 20 retries).
113
+
114
+ Both methods return a dictionary with proxy details (e.g., `{ 'ip': '...', 'port': '...', ... }`) or `None` if no working proxy is found.
@@ -0,0 +1,14 @@
1
+ pocong/__init__.py,sha256=Hl0PkSkg6LV6IRLzXnGc0K2GY-drxkZEpt5qTAVDUkY,109
2
+ pocong/_version.py,sha256=hFgXEoBMIBnyj4LP_PTO6GNR5nrvefsXEARVrbSCj7o,497
3
+ pocong/cli.py,sha256=_f_aU4pckbQ_baF9oHwbqwmBFiQFn5Irvi-v5rDZ70o,529
4
+ pocong/pocong.py,sha256=h0hwdogXGFqerm-5ZPeT-irPn91pCcQRjiHThXsRzEk,19
5
+ pocong/utils.py,sha256=MAbbL9PXRWnBpJKgI869ZfY42Eph73zcbJyK0jH2Nak,35
6
+ pocong/proxy_spiders/__init__.py,sha256=q3ifQZd4_TipTmVYklCdvgVZEkuLR91Qo0LwM4CBMnA,2384
7
+ pocong/proxy_spiders/pipelines.py,sha256=k8DRupjvN7qnIk0uFNJ_3JEFlDadtO0PCBH0iOsPKp4,145
8
+ pocong/proxy_spiders/spiders/__init__.py,sha256=4-oTTycftRXl_6z92SjSi_XmDfP-1xAaVj39HMggLWc,52
9
+ pocong/proxy_spiders/spiders/free_proxy_list_net_spider.py,sha256=AV-8_KF7UMRkdcuCaqdhGtbsMawpJt9G3NF6S7aVQO4,886
10
+ pocong-1.0.0.dist-info/METADATA,sha256=2A3QXiD-vF7rgHM6sN4Yt9-O8KbRm_5DGaKGfCaKMjw,3411
11
+ pocong-1.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
12
+ pocong-1.0.0.dist-info/entry_points.txt,sha256=Q3F4OQIZJzlnS2tnEuTzcn2tN4S5Btd08o_9Otdb4bM,43
13
+ pocong-1.0.0.dist-info/top_level.txt,sha256=ZMo2AlCPGpM4N7hHVSNoIjbM1D90yjFhRra0YmCfTO4,7
14
+ pocong-1.0.0.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- pocong/__init__.py,sha256=Hl0PkSkg6LV6IRLzXnGc0K2GY-drxkZEpt5qTAVDUkY,109
2
- pocong/_version.py,sha256=6DzWhBmSmCgH-UnOBP9FE_v5R2nrgAZOwvaIfjiu-CE,497
3
- pocong/cli.py,sha256=_f_aU4pckbQ_baF9oHwbqwmBFiQFn5Irvi-v5rDZ70o,529
4
- pocong/pocong.py,sha256=h0hwdogXGFqerm-5ZPeT-irPn91pCcQRjiHThXsRzEk,19
5
- pocong/utils.py,sha256=MAbbL9PXRWnBpJKgI869ZfY42Eph73zcbJyK0jH2Nak,35
6
- pocong-0.1.3.dist-info/METADATA,sha256=BsjoWSXB7UYifsd7F5thMiWK3rTyfjX5n3Kfl47gG_0,1925
7
- pocong-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
8
- pocong-0.1.3.dist-info/entry_points.txt,sha256=Q3F4OQIZJzlnS2tnEuTzcn2tN4S5Btd08o_9Otdb4bM,43
9
- pocong-0.1.3.dist-info/top_level.txt,sha256=ZMo2AlCPGpM4N7hHVSNoIjbM1D90yjFhRra0YmCfTO4,7
10
- pocong-0.1.3.dist-info/RECORD,,
File without changes