pocong 0.1.2__tar.gz → 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
{pocong-0.1.2 → pocong-1.0.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pocong
- Version: 0.1.2
+ Version: 1.0.0
  Summary: Python Oriented Crawling Ongoing (POCONG): a simple crawling framework
  Home-page: https://gitlab.com/mohsin3107/pocong
  Author: Singgih
@@ -19,6 +19,9 @@ Classifier: Topic :: Software Development :: Libraries
  Classifier: Topic :: Internet :: WWW/HTTP
  Requires-Python: >=3.8
  Description-Content-Type: text/markdown
+ Requires-Dist: Scrapy>=2.5.0
+ Requires-Dist: pandas>=1.3.0
+ Requires-Dist: requests>=2.25.0
  Requires-Dist: Click>=7.0
  Provides-Extra: dev
  Requires-Dist: pytest; extra == "dev"
@@ -46,7 +49,7 @@ Dynamic: requires-python
  Dynamic: summary

  <p align="center">
- <img src="https://gitlab.com/uploads/-/system/project/avatar/73633795/Screenshot_2025-08-22_at_18.40.11.png?width=128" alt="POCONG Logo" width="128"/>
+ <img src="https://i.ibb.co.com/35P4Nq9x/Screenshot-2025-08-22-at-18-40-11.png?width=128" alt="POCONG Logo" width="128"/>
  </p>

  # POCONG 🪦
@@ -57,3 +60,55 @@ POCONG is a lightweight web crawling framework built in Python.
  ## Installation
  ```bash
  pip install pocong
+ ```
+
+ ## Usage: Get Proxy from proxy_spiders
+
+ You can use the `get_proxy` and `get_proxy_random` methods from `proxy_spiders` to fetch working proxies.
+
+ ```python
+ from pocong.proxy_spiders import GetProxy
+
+ gp = GetProxy()
+
+ # Get the first working proxy
+ proxy = gp.get_proxy()
+ print("First working proxy:", proxy)
+ ```
+ ```python
+ from pocong.proxy_spiders import GetProxy
+
+ gp = GetProxy()
+
+ # Get a random working proxy
+ random_proxy = gp.get_proxy_random()
+ print("Random working proxy:", random_proxy)
+ ```
+
+ Sample output:
+ ```
+ First working proxy: {'ip': '123.45.67.89', 'port': '8080', 'https': 'yes', ...}
+ Random working proxy: {'ip': '98.76.54.32', 'port': '3128', 'https': 'yes', ...}
+ ```
+
+ You can use the returned proxy dictionary with the `requests` library, for example:
+
+ ```python
+ import requests
+
+ proxy = gp.get_proxy()
+ if proxy:
+     proxies = {
+         'http': f"http://{proxy['ip']}:{proxy['port']}",
+         'https': f"http://{proxy['ip']}:{proxy['port']}"
+     }
+     response = requests.get('https://httpbin.org/ip', proxies=proxies)
+     print(response.json())
+ else:
+     print("No working proxy found.")
+ ```
+
+ - `get_proxy()` will return the first working proxy found.
+ - `get_proxy_random()` will return a random working proxy (with up to 20 retries).
+
+ Both methods return a dictionary with proxy details (e.g., `{ 'ip': '...', 'port': '...', ... }`) or `None` if no working proxy is found.
pocong-1.0.0/README.md ADDED
@@ -0,0 +1,64 @@
+ <p align="center">
+ <img src="https://i.ibb.co.com/35P4Nq9x/Screenshot-2025-08-22-at-18-40-11.png?width=128" alt="POCONG Logo" width="128"/>
+ </p>
+
+ # POCONG 🪦
+ **Python Oriented Crawling ON Going**
+
+ POCONG is a lightweight web crawling framework built in Python.
+
+ ## Installation
+ ```bash
+ pip install pocong
+ ```
+
+ ## Usage: Get Proxy from proxy_spiders
+
+ You can use the `get_proxy` and `get_proxy_random` methods from `proxy_spiders` to fetch working proxies.
+
+ ```python
+ from pocong.proxy_spiders import GetProxy
+
+ gp = GetProxy()
+
+ # Get the first working proxy
+ proxy = gp.get_proxy()
+ print("First working proxy:", proxy)
+ ```
+ ```python
+ from pocong.proxy_spiders import GetProxy
+
+ gp = GetProxy()
+
+ # Get a random working proxy
+ random_proxy = gp.get_proxy_random()
+ print("Random working proxy:", random_proxy)
+ ```
+
+ Sample output:
+ ```
+ First working proxy: {'ip': '123.45.67.89', 'port': '8080', 'https': 'yes', ...}
+ Random working proxy: {'ip': '98.76.54.32', 'port': '3128', 'https': 'yes', ...}
+ ```
+
+ You can use the returned proxy dictionary with the `requests` library, for example:
+
+ ```python
+ import requests
+
+ proxy = gp.get_proxy()
+ if proxy:
+     proxies = {
+         'http': f"http://{proxy['ip']}:{proxy['port']}",
+         'https': f"http://{proxy['ip']}:{proxy['port']}"
+     }
+     response = requests.get('https://httpbin.org/ip', proxies=proxies)
+     print(response.json())
+ else:
+     print("No working proxy found.")
+ ```
+
+ - `get_proxy()` will return the first working proxy found.
+ - `get_proxy_random()` will return a random working proxy (with up to 20 retries).
+
+ Both methods return a dictionary with proxy details (e.g., `{ 'ip': '...', 'port': '...', ... }`) or `None` if no working proxy is found.
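
A small variation on the README example above, sketched under the assumption that `get_proxy()` returns the documented `{'ip': ..., 'port': ...}` dict (or `None`): mounting the validated proxy on a `requests.Session` lets it be reused across several calls instead of being rebuilt per request.

```python
import requests

from pocong.proxy_spiders import GetProxy

proxy = GetProxy().get_proxy()
if proxy:
    address = f"http://{proxy['ip']}:{proxy['port']}"
    with requests.Session() as session:
        # The session sends every request through the one validated proxy.
        session.proxies.update({'http': address, 'https': address})
        print(session.get('https://httpbin.org/ip', timeout=10).json())
else:
    print("No working proxy found.")
```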
{pocong-0.1.2 → pocong-1.0.0}/setup.py
@@ -13,6 +13,9 @@ with open(path.join(here, "README.md"), encoding="utf-8") as f:
  long_description = f.read()

  install_requires = [
+     "Scrapy>=2.5.0",
+     "pandas>=1.3.0",
+     "requests>=2.25.0",
      "Click>=7.0",
  ]

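This setup.py change is the source of the three new `Requires-Dist` entries in PKG-INFO: 1.0.0 declares Scrapy, pandas, and requests as runtime dependencies for the new `proxy_spiders` package. A standard-library check of what an installed copy declares (no assumptions about pocong internals):

```python
from importlib.metadata import requires, version

print(version("pocong"))  # expected: 1.0.0
for req in requires("pocong") or []:
    # Expected: Scrapy>=2.5.0, pandas>=1.3.0, requests>=2.25.0, Click>=7.0,
    # plus the entries guarded by extra == "dev" (e.g., pytest).
    print(req)
```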
{pocong-0.1.2 → pocong-1.0.0}/src/pocong/_version.py
@@ -8,11 +8,11 @@ import json

  version_json = '''
  {
-  "date": "2025-08-23T21:54:34+0700",
+  "date": "2025-09-21T17:59:23+0700",
   "dirty": false,
   "error": null,
-  "full-revisionid": "f5e6ba30b073075bba032c169a3fc3545ec1eaa1",
-  "version": "0.1.2"
+  "full-revisionid": "9a2d0f58b89a546044fc52948ce274767aa450d4",
+  "version": "1.0.0"
  }
  ''' # END VERSION_JSON

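The `version_json` block and the `# END VERSION_JSON` marker are the layout of a versioneer-generated `_version.py`. Assuming the standard versioneer interface (the rest of the file is not shown in this diff), the embedded metadata can be read back at runtime:

```python
# Assumes versioneer's standard get_versions() helper, defined further down
# in _version.py and not visible in this diff.
from pocong._version import get_versions

info = get_versions()
print(info["version"])          # expected: 1.0.0
print(info["full-revisionid"])  # expected: 9a2d0f58b89a546044fc52948ce274767aa450d4
```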
pocong-1.0.0/src/pocong/proxy_spiders/__init__.py ADDED
@@ -0,0 +1,74 @@
+ # Init file for spiders module to make it a package
+ import random
+
+ import requests
+ import pandas as pd
+ from scrapy.crawler import CrawlerProcess
+
+ from pocong.proxy_spiders.spiders.free_proxy_list_net_spider import ProxySpider
+
+
+ class GetProxy():
+     '''
+     Class to get proxies using Scrapy spiders and validate them.
+     '''
+     def __init__(self):
+         pass
+
+     def _check_proxy(self, x):
+         proxy = f"http://{x['ip']}:{x['port']}"
+         try:
+             response = requests.get("https://httpbin.org/ip", proxies={'https': proxy}, timeout=10)
+             if response.status_code == 200 and response.json().get('origin') == x['ip']:
+                 print(f"checking proxy: {proxy} success")  # noqa
+                 return response.status_code
+             print(f"checking proxy: {proxy} failed")  # noqa
+             return 0
+         except requests.RequestException:
+             print(f"checking proxy: {proxy} failed")  # noqa
+             return 0
+
+     def _run_example_spider(self):
+         process = CrawlerProcess(settings={
+             "LOG_LEVEL": "ERROR",
+             "ITEM_PIPELINES": {'pocong.proxy_spiders.pipelines.Pipelines': 1},
+         })
+         process.crawl(ProxySpider)
+         process.start()
+         from pocong.proxy_spiders.pipelines import collected_items
+         return collected_items
+
+     def _get_proxy_from_scrape(self):
+         items = self._run_example_spider()
+         df = pd.DataFrame(items)
+         df = df[df['https'] == 'yes']
+         df = df.drop_duplicates(subset=['ip', 'port'])
+         proxies_json = df.to_dict(orient='records')
+         return proxies_json
+
+     def get_proxy(self):
+         '''
+         Get a working proxy from the list of proxies.
+         parameter: None
+         return: dict or None
+         '''
+         proxies_json = self._get_proxy_from_scrape()
+         for proxy in proxies_json:
+             if self._check_proxy(proxy) == 200:
+                 return proxy
+
+     def get_proxy_random(self):
+         '''
+         Get a random working proxy from the list of proxies.
+         parameter: None
+         return: dict or None
+         '''
+         proxies_json = self._get_proxy_from_scrape()
+         retry = 0
+         proxy = None
+         while retry < 20:
+             retry += 1
+             proxy = random.choice(proxies_json)
+             if self._check_proxy(proxy) == 200:
+                 break
+         return proxy
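
One operational caveat worth flagging for this module: `_run_example_spider()` starts a new Scrapy `CrawlerProcess`, and the underlying Twisted reactor cannot be restarted within a single Python process, so a second scrape in the same interpreter will typically fail. If several proxies are needed, one workaround is to scrape once and validate from the cached list; the sketch below leans on the private helpers shown above, which may change between releases:

```python
from pocong.proxy_spiders import GetProxy

gp = GetProxy()
candidates = gp._get_proxy_from_scrape()  # runs the spider exactly once

# Validate against httpbin and keep the first three working proxies.
working = [p for p in candidates if gp._check_proxy(p) == 200][:3]
print("Validated proxies:", working)
```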
pocong-1.0.0/src/pocong/proxy_spiders/pipelines.py ADDED
@@ -0,0 +1,7 @@
+ collected_items = []
+
+
+ class Pipelines:
+     def process_item(self, item, spider):
+         collected_items.append(dict(item))
+         return item
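
`collected_items` is a module-level accumulator: the pipeline appends each scraped item, and `_run_example_spider()` imports the list once the crawl ends. Note (our observation, not documented behavior) that nothing ever clears the list, so code that reuses it within one process should reset it explicitly:

```python
# Hypothetical defensive reset; the package itself never clears the list.
from pocong.proxy_spiders.pipelines import collected_items

collected_items.clear()  # drop items left over from a previous crawl
```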
pocong-1.0.0/src/pocong/proxy_spiders/spiders/__init__.py ADDED
@@ -0,0 +1 @@
+ # Init file for spiders module to make it a package
pocong-1.0.0/src/pocong/proxy_spiders/spiders/free_proxy_list_net_spider.py ADDED
@@ -0,0 +1,25 @@
+ import scrapy
+
+
+ class ProxySpider(scrapy.Spider):
+     name = "example"
+     start_urls = [
+         'https://free-proxy-list.net/en/',
+     ]
+
+     def parse(self, response):
+         # Extract proxy table rows
+         rows = response.css('table tbody tr')
+         for row in rows:
+             columns = row.css('td')
+             if len(columns) >= 8:
+                 yield {
+                     'ip': columns[0].css('::text').get(),
+                     'port': columns[1].css('::text').get(),
+                     'code': columns[2].css('::text').get(),
+                     'country': columns[3].css('::text').get(),
+                     'anonymity': columns[4].css('::text').get(),
+                     'google': columns[5].css('::text').get(),
+                     'https': columns[6].css('::text').get(),
+                     'last_checked': columns[7].css('::text').get(),
+                 }
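
The spider's `parse()` only needs a response object, so its row-extraction logic can be exercised offline. In the sketch below the HTML is a hand-written sample shaped like the free-proxy-list.net table, not a captured copy of the real page:

```python
from scrapy.http import HtmlResponse

from pocong.proxy_spiders.spiders.free_proxy_list_net_spider import ProxySpider

SAMPLE = b"""
<table><tbody><tr>
<td>123.45.67.89</td><td>8080</td><td>ID</td><td>Indonesia</td>
<td>elite proxy</td><td>no</td><td>yes</td><td>1 min ago</td>
</tr></tbody></table>
"""

response = HtmlResponse(url="https://free-proxy-list.net/en/", body=SAMPLE, encoding="utf-8")
for item in ProxySpider().parse(response):
    print(item)  # {'ip': '123.45.67.89', 'port': '8080', ..., 'https': 'yes', ...}
```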
{pocong-0.1.2 → pocong-1.0.0}/src/pocong.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pocong
- Version: 0.1.2
+ Version: 1.0.0
  Summary: Python Oriented Crawling Ongoing (POCONG): a simple crawling framework
  Home-page: https://gitlab.com/mohsin3107/pocong
  Author: Singgih
@@ -19,6 +19,9 @@ Classifier: Topic :: Software Development :: Libraries
  Classifier: Topic :: Internet :: WWW/HTTP
  Requires-Python: >=3.8
  Description-Content-Type: text/markdown
+ Requires-Dist: Scrapy>=2.5.0
+ Requires-Dist: pandas>=1.3.0
+ Requires-Dist: requests>=2.25.0
  Requires-Dist: Click>=7.0
  Provides-Extra: dev
  Requires-Dist: pytest; extra == "dev"
@@ -46,7 +49,7 @@ Dynamic: requires-python
  Dynamic: summary

  <p align="center">
- <img src="https://gitlab.com/uploads/-/system/project/avatar/73633795/Screenshot_2025-08-22_at_18.40.11.png?width=128" alt="POCONG Logo" width="128"/>
+ <img src="https://i.ibb.co.com/35P4Nq9x/Screenshot-2025-08-22-at-18-40-11.png?width=128" alt="POCONG Logo" width="128"/>
  </p>

  # POCONG 🪦
@@ -57,3 +60,55 @@ POCONG is a lightweight web crawling framework built in Python.
  ## Installation
  ```bash
  pip install pocong
+ ```
+
+ ## Usage: Get Proxy from proxy_spiders
+
+ You can use the `get_proxy` and `get_proxy_random` methods from `proxy_spiders` to fetch working proxies.
+
+ ```python
+ from pocong.proxy_spiders import GetProxy
+
+ gp = GetProxy()
+
+ # Get the first working proxy
+ proxy = gp.get_proxy()
+ print("First working proxy:", proxy)
+ ```
+ ```python
+ from pocong.proxy_spiders import GetProxy
+
+ gp = GetProxy()
+
+ # Get a random working proxy
+ random_proxy = gp.get_proxy_random()
+ print("Random working proxy:", random_proxy)
+ ```
+
+ Sample output:
+ ```
+ First working proxy: {'ip': '123.45.67.89', 'port': '8080', 'https': 'yes', ...}
+ Random working proxy: {'ip': '98.76.54.32', 'port': '3128', 'https': 'yes', ...}
+ ```
+
+ You can use the returned proxy dictionary with the `requests` library, for example:
+
+ ```python
+ import requests
+
+ proxy = gp.get_proxy()
+ if proxy:
+     proxies = {
+         'http': f"http://{proxy['ip']}:{proxy['port']}",
+         'https': f"http://{proxy['ip']}:{proxy['port']}"
+     }
+     response = requests.get('https://httpbin.org/ip', proxies=proxies)
+     print(response.json())
+ else:
+     print("No working proxy found.")
+ ```
+
+ - `get_proxy()` will return the first working proxy found.
+ - `get_proxy_random()` will return a random working proxy (with up to 20 retries).
+
+ Both methods return a dictionary with proxy details (e.g., `{ 'ip': '...', 'port': '...', ... }`) or `None` if no working proxy is found.
{pocong-0.1.2 → pocong-1.0.0}/src/pocong.egg-info/SOURCES.txt
@@ -15,4 +15,9 @@ src/pocong.egg-info/dependency_links.txt
  src/pocong.egg-info/entry_points.txt
  src/pocong.egg-info/requires.txt
  src/pocong.egg-info/top_level.txt
- tests/test_pocong.py
+ src/pocong/proxy_spiders/__init__.py
+ src/pocong/proxy_spiders/pipelines.py
+ src/pocong/proxy_spiders/spiders/__init__.py
+ src/pocong/proxy_spiders/spiders/free_proxy_list_net_spider.py
+ tests/test_pocong.py
+ tests/test_proxy_spiders.py
{pocong-0.1.2 → pocong-1.0.0}/src/pocong.egg-info/requires.txt
@@ -1,3 +1,6 @@
+ Scrapy>=2.5.0
+ pandas>=1.3.0
+ requests>=2.25.0
  Click>=7.0

  [dev]
pocong-1.0.0/tests/test_proxy_spiders.py ADDED
@@ -0,0 +1,21 @@
+
+ import requests
+ import pytest
+
+ from pocong.proxy_spiders import GetProxy
+
+
+ def test_get_proxy():
+     """
+     Test that a random proxy from get_proxy can make a successful request to httpbin.org/ip.
+     """
+     proxy = GetProxy().get_proxy()
+     assert proxy is not None, "No proxy returned by get_proxy()"
+     proxies = {'https': f'http://{proxy["ip"]}:{proxy["port"]}'}
+     try:
+         response = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=10)
+         response.raise_for_status()
+         assert response.status_code == 200
+         assert 'origin' in response.text
+     except requests.RequestException as e:
+         pytest.skip(f"HTTP proxy request failed: {e}")
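
Because free proxies are unreliable, the test downgrades network errors to a skip rather than a failure. To run just this file programmatically (equivalent to `pytest -q tests/test_proxy_spiders.py` from the project root):

```python
import pytest

# Exits with pytest's own status code; the test skips itself when the
# fetched proxy cannot reach httpbin.org.
raise SystemExit(pytest.main(["-q", "tests/test_proxy_spiders.py"]))
```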
pocong-0.1.2/README.md DELETED
@@ -1,12 +0,0 @@
1
- <p align="center">
2
- <img src="https://gitlab.com/uploads/-/system/project/avatar/73633795/Screenshot_2025-08-22_at_18.40.11.png?width=128" alt="POCONG Logo" width="128"/>
3
- </p>
4
-
5
- # POCONG 🪦
6
- **Python Oriented Crawling ON Going**
7
-
8
- POCONG is a lightweight web crawling framework built in Python.
9
-
10
- ## Installation
11
- ```bash
12
- pip install pocong
9 files without changes