pocong 0.1.2__tar.gz → 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
{pocong-0.1.2 → pocong-1.0.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pocong
- Version: 0.1.2
+ Version: 1.0.0
  Summary: Python Oriented Crawling Ongoing (POCONG): a simple crawling framework
  Home-page: https://gitlab.com/mohsin3107/pocong
  Author: Singgih
@@ -19,6 +19,9 @@ Classifier: Topic :: Software Development :: Libraries
  Classifier: Topic :: Internet :: WWW/HTTP
  Requires-Python: >=3.8
  Description-Content-Type: text/markdown
+ Requires-Dist: Scrapy>=2.5.0
+ Requires-Dist: pandas>=1.3.0
+ Requires-Dist: requests>=2.25.0
  Requires-Dist: Click>=7.0
  Provides-Extra: dev
  Requires-Dist: pytest; extra == "dev"
@@ -46,7 +49,7 @@ Dynamic: requires-python
  Dynamic: summary

  <p align="center">
- <img src="https://gitlab.com/uploads/-/system/project/avatar/73633795/Screenshot_2025-08-22_at_18.40.11.png?width=128" alt="POCONG Logo" width="128"/>
+ <img src="https://i.ibb.co.com/35P4Nq9x/Screenshot-2025-08-22-at-18-40-11.png?width=128" alt="POCONG Logo" width="128"/>
  </p>

  # POCONG 🪦
@@ -57,3 +60,55 @@ POCONG is a lightweight web crawling framework built in Python.
  ## Installation
  ```bash
  pip install pocong
+ ```
+
+ ## Usage: Get Proxy from proxy_spiders
+
+ You can use the `get_proxy` and `get_proxy_random` methods from `proxy_spiders` to fetch working proxies.
+
+ ```python
+ from pocong.proxy_spiders import GetProxy
+
+ gp = GetProxy()
+
+ # Get the first working proxy
+ proxy = gp.get_proxy()
+ print("First working proxy:", proxy)
+ ```
+ ```python
+ from pocong.proxy_spiders import GetProxy
+
+ gp = GetProxy()
+
+ # Get a random working proxy
+ random_proxy = gp.get_proxy_random()
+ print("Random working proxy:", random_proxy)
+ ```
+
+ Sample output:
+ ```
+ First working proxy: {'ip': '123.45.67.89', 'port': '8080', 'https': 'yes', ...}
+ Random working proxy: {'ip': '98.76.54.32', 'port': '3128', 'https': 'yes', ...}
+ ```
+
+ You can use the returned proxy dictionary with the `requests` library, for example:
+
+ ```python
+ import requests
+
+ proxy = gp.get_proxy()
+ if proxy:
+     proxies = {
+         'http': f"http://{proxy['ip']}:{proxy['port']}",
+         'https': f"http://{proxy['ip']}:{proxy['port']}"
+     }
+     response = requests.get('https://httpbin.org/ip', proxies=proxies)
+     print(response.json())
+ else:
+     print("No working proxy found.")
+ ```
+
+ - `get_proxy()` will return the first working proxy found.
+ - `get_proxy_random()` will return a random working proxy (with up to 20 retries).
+
+ Both methods return a dictionary with proxy details (e.g., `{ 'ip': '...', 'port': '...', ... }`) or `None` if no working proxy is found.
pocong-1.0.0/README.md ADDED
@@ -0,0 +1,64 @@
+ <p align="center">
+ <img src="https://i.ibb.co.com/35P4Nq9x/Screenshot-2025-08-22-at-18-40-11.png?width=128" alt="POCONG Logo" width="128"/>
+ </p>
+
+ # POCONG 🪦
+ **Python Oriented Crawling ON Going**
+
+ POCONG is a lightweight web crawling framework built in Python.
+
+ ## Installation
+ ```bash
+ pip install pocong
+ ```
+
+ ## Usage: Get Proxy from proxy_spiders
+
+ You can use the `get_proxy` and `get_proxy_random` methods from `proxy_spiders` to fetch working proxies.
+
+ ```python
+ from pocong.proxy_spiders import GetProxy
+
+ gp = GetProxy()
+
+ # Get the first working proxy
+ proxy = gp.get_proxy()
+ print("First working proxy:", proxy)
+ ```
+ ```python
+ from pocong.proxy_spiders import GetProxy
+
+ gp = GetProxy()
+
+ # Get a random working proxy
+ random_proxy = gp.get_proxy_random()
+ print("Random working proxy:", random_proxy)
+ ```
+
+ Sample output:
+ ```
+ First working proxy: {'ip': '123.45.67.89', 'port': '8080', 'https': 'yes', ...}
+ Random working proxy: {'ip': '98.76.54.32', 'port': '3128', 'https': 'yes', ...}
+ ```
+
+ You can use the returned proxy dictionary with the `requests` library, for example:
+
+ ```python
+ import requests
+
+ proxy = gp.get_proxy()
+ if proxy:
+     proxies = {
+         'http': f"http://{proxy['ip']}:{proxy['port']}",
+         'https': f"http://{proxy['ip']}:{proxy['port']}"
+     }
+     response = requests.get('https://httpbin.org/ip', proxies=proxies)
+     print(response.json())
+ else:
+     print("No working proxy found.")
+ ```
+
+ - `get_proxy()` will return the first working proxy found.
+ - `get_proxy_random()` will return a random working proxy (with up to 20 retries).
+
+ Both methods return a dictionary with proxy details (e.g., `{ 'ip': '...', 'port': '...', ... }`) or `None` if no working proxy is found.
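
A small variation on the README example above, sketched under the assumption that `get_proxy()` returns the documented `{'ip': ..., 'port': ...}` dict (or `None`): mounting the validated proxy on a `requests.Session` lets it be reused across several calls instead of being rebuilt per request.

```python
import requests

from pocong.proxy_spiders import GetProxy

proxy = GetProxy().get_proxy()
if proxy:
    address = f"http://{proxy['ip']}:{proxy['port']}"
    with requests.Session() as session:
        # The session sends every request through the one validated proxy.
        session.proxies.update({'http': address, 'https': address})
        print(session.get('https://httpbin.org/ip', timeout=10).json())
else:
    print("No working proxy found.")
```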
{pocong-0.1.2 → pocong-1.0.0}/setup.py
@@ -13,6 +13,9 @@ with open(path.join(here, "README.md"), encoding="utf-8") as f:
  long_description = f.read()

  install_requires = [
+     "Scrapy>=2.5.0",
+     "pandas>=1.3.0",
+     "requests>=2.25.0",
      "Click>=7.0",
  ]

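This setup.py change is the source of the three new `Requires-Dist` entries in PKG-INFO: 1.0.0 declares Scrapy, pandas, and requests as runtime dependencies for the new `proxy_spiders` package. A standard-library check of what an installed copy declares (no assumptions about pocong internals):

```python
from importlib.metadata import requires, version

print(version("pocong"))  # expected: 1.0.0
for req in requires("pocong") or []:
    # Expected: Scrapy>=2.5.0, pandas>=1.3.0, requests>=2.25.0, Click>=7.0,
    # plus the entries guarded by extra == "dev" (e.g., pytest).
    print(req)
```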
{pocong-0.1.2 → pocong-1.0.0}/src/pocong/_version.py
@@ -8,11 +8,11 @@ import json

  version_json = '''
  {
-  "date": "2025-08-23T21:54:34+0700",
+  "date": "2025-09-21T17:59:23+0700",
   "dirty": false,
   "error": null,
-  "full-revisionid": "f5e6ba30b073075bba032c169a3fc3545ec1eaa1",
-  "version": "0.1.2"
+  "full-revisionid": "9a2d0f58b89a546044fc52948ce274767aa450d4",
+  "version": "1.0.0"
  }
  ''' # END VERSION_JSON

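The `version_json` block and the `# END VERSION_JSON` marker are the layout of a versioneer-generated `_version.py`. Assuming the standard versioneer interface (the rest of the file is not shown in this diff), the embedded metadata can be read back at runtime:

```python
# Assumes versioneer's standard get_versions() helper, defined further down
# in _version.py and not visible in this diff.
from pocong._version import get_versions

info = get_versions()
print(info["version"])          # expected: 1.0.0
print(info["full-revisionid"])  # expected: 9a2d0f58b89a546044fc52948ce274767aa450d4
```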
pocong-1.0.0/src/pocong/proxy_spiders/__init__.py ADDED
@@ -0,0 +1,74 @@
+ # Init file for spiders module to make it a package
+ import random
+
+ import requests
+ import pandas as pd
+ from scrapy.crawler import CrawlerProcess
+
+ from pocong.proxy_spiders.spiders.free_proxy_list_net_spider import ProxySpider
+
+
+ class GetProxy():
+     '''
+     Class to get proxies using Scrapy spiders and validate them.
+     '''
+     def __init__(self):
+         pass
+
+     def _check_proxy(self, x):
+         proxy = f"http://{x['ip']}:{x['port']}"
+         try:
+             response = requests.get("https://httpbin.org/ip", proxies={'https': proxy}, timeout=10)
+             if response.status_code == 200 and response.json().get('origin') == x['ip']:
+                 print(f"checking proxy: {proxy} success")  # noqa
+                 return response.status_code
+             print(f"checking proxy: {proxy} failed")  # noqa
+             return 0
+         except requests.RequestException:
+             print(f"checking proxy: {proxy} failed")  # noqa
+             return 0
+
+     def _run_example_spider(self):
+         process = CrawlerProcess(settings={
+             "LOG_LEVEL": "ERROR",
+             "ITEM_PIPELINES": {'pocong.proxy_spiders.pipelines.Pipelines': 1},
+         })
+         process.crawl(ProxySpider)
+         process.start()
+         from pocong.proxy_spiders.pipelines import collected_items
+         return collected_items
+
+     def _get_proxy_from_scrape(self):
+         items = self._run_example_spider()
+         df = pd.DataFrame(items)
+         df = df[df['https'] == 'yes']
+         df = df.drop_duplicates(subset=['ip', 'port'])
+         proxies_json = df.to_dict(orient='records')
+         return proxies_json
+
+     def get_proxy(self):
+         '''
+         Get a working proxy from the list of proxies.
+         parameter: None
+         return: dict or None
+         '''
+         proxies_json = self._get_proxy_from_scrape()
+         for proxy in proxies_json:
+             if self._check_proxy(proxy) == 200:
+                 return proxy
+
+     def get_proxy_random(self):
+         '''
+         Get a random working proxy from the list of proxies.
+         parameter: None
+         return: dict or None
+         '''
+         proxies_json = self._get_proxy_from_scrape()
+         retry = 0
+         proxy = None
+         while retry < 20:
+             retry += 1
+             proxy = random.choice(proxies_json)
+             if self._check_proxy(proxy) == 200:
+                 break
+         return proxy
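
One operational caveat worth flagging for this module: `_run_example_spider()` starts a new Scrapy `CrawlerProcess`, and the underlying Twisted reactor cannot be restarted within a single Python process, so a second scrape in the same interpreter will typically fail. If several proxies are needed, one workaround is to scrape once and validate from the cached list; the sketch below leans on the private helpers shown above, which may change between releases:

```python
from pocong.proxy_spiders import GetProxy

gp = GetProxy()
candidates = gp._get_proxy_from_scrape()  # runs the spider exactly once

# Validate against httpbin and keep the first three working proxies.
working = [p for p in candidates if gp._check_proxy(p) == 200][:3]
print("Validated proxies:", working)
```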
pocong-1.0.0/src/pocong/proxy_spiders/pipelines.py ADDED
@@ -0,0 +1,7 @@
+ collected_items = []
+
+
+ class Pipelines:
+     def process_item(self, item, spider):
+         collected_items.append(dict(item))
+         return item
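
`collected_items` is a module-level accumulator: the pipeline appends each scraped item, and `_run_example_spider()` imports the list once the crawl ends. Note (our observation, not documented behavior) that nothing ever clears the list, so code that reuses it within one process should reset it explicitly:

```python
# Hypothetical defensive reset; the package itself never clears the list.
from pocong.proxy_spiders.pipelines import collected_items

collected_items.clear()  # drop items left over from a previous crawl
```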
pocong-1.0.0/src/pocong/proxy_spiders/spiders/__init__.py ADDED
@@ -0,0 +1 @@
+ # Init file for spiders module to make it a package
pocong-1.0.0/src/pocong/proxy_spiders/spiders/free_proxy_list_net_spider.py ADDED
@@ -0,0 +1,25 @@
+ import scrapy
+
+
+ class ProxySpider(scrapy.Spider):
+     name = "example"
+     start_urls = [
+         'https://free-proxy-list.net/en/',
+     ]
+
+     def parse(self, response):
+         # Extract proxy table rows
+         rows = response.css('table tbody tr')
+         for row in rows:
+             columns = row.css('td')
+             if len(columns) >= 8:
+                 yield {
+                     'ip': columns[0].css('::text').get(),
+                     'port': columns[1].css('::text').get(),
+                     'code': columns[2].css('::text').get(),
+                     'country': columns[3].css('::text').get(),
+                     'anonymity': columns[4].css('::text').get(),
+                     'google': columns[5].css('::text').get(),
+                     'https': columns[6].css('::text').get(),
+                     'last_checked': columns[7].css('::text').get(),
+                 }
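
The spider's `parse()` only needs a response object, so its row-extraction logic can be exercised offline. In the sketch below the HTML is a hand-written sample shaped like the free-proxy-list.net table, not a captured copy of the real page:

```python
from scrapy.http import HtmlResponse

from pocong.proxy_spiders.spiders.free_proxy_list_net_spider import ProxySpider

SAMPLE = b"""
<table><tbody><tr>
<td>123.45.67.89</td><td>8080</td><td>ID</td><td>Indonesia</td>
<td>elite proxy</td><td>no</td><td>yes</td><td>1 min ago</td>
</tr></tbody></table>
"""

response = HtmlResponse(url="https://free-proxy-list.net/en/", body=SAMPLE, encoding="utf-8")
for item in ProxySpider().parse(response):
    print(item)  # {'ip': '123.45.67.89', 'port': '8080', ..., 'https': 'yes', ...}
```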
{pocong-0.1.2 → pocong-1.0.0}/src/pocong.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pocong
- Version: 0.1.2
+ Version: 1.0.0
  Summary: Python Oriented Crawling Ongoing (POCONG): a simple crawling framework
  Home-page: https://gitlab.com/mohsin3107/pocong
  Author: Singgih
@@ -19,6 +19,9 @@ Classifier: Topic :: Software Development :: Libraries
  Classifier: Topic :: Internet :: WWW/HTTP
  Requires-Python: >=3.8
  Description-Content-Type: text/markdown
+ Requires-Dist: Scrapy>=2.5.0
+ Requires-Dist: pandas>=1.3.0
+ Requires-Dist: requests>=2.25.0
  Requires-Dist: Click>=7.0
  Provides-Extra: dev
  Requires-Dist: pytest; extra == "dev"
@@ -46,7 +49,7 @@ Dynamic: requires-python
  Dynamic: summary

  <p align="center">
- <img src="https://gitlab.com/uploads/-/system/project/avatar/73633795/Screenshot_2025-08-22_at_18.40.11.png?width=128" alt="POCONG Logo" width="128"/>
+ <img src="https://i.ibb.co.com/35P4Nq9x/Screenshot-2025-08-22-at-18-40-11.png?width=128" alt="POCONG Logo" width="128"/>
  </p>

  # POCONG 🪦
@@ -57,3 +60,55 @@ POCONG is a lightweight web crawling framework built in Python.
  ## Installation
  ```bash
  pip install pocong
+ ```
+
+ ## Usage: Get Proxy from proxy_spiders
+
+ You can use the `get_proxy` and `get_proxy_random` methods from `proxy_spiders` to fetch working proxies.
+
+ ```python
+ from pocong.proxy_spiders import GetProxy
+
+ gp = GetProxy()
+
+ # Get the first working proxy
+ proxy = gp.get_proxy()
+ print("First working proxy:", proxy)
+ ```
+ ```python
+ from pocong.proxy_spiders import GetProxy
+
+ gp = GetProxy()
+
+ # Get a random working proxy
+ random_proxy = gp.get_proxy_random()
+ print("Random working proxy:", random_proxy)
+ ```
+
+ Sample output:
+ ```
+ First working proxy: {'ip': '123.45.67.89', 'port': '8080', 'https': 'yes', ...}
+ Random working proxy: {'ip': '98.76.54.32', 'port': '3128', 'https': 'yes', ...}
+ ```
+
+ You can use the returned proxy dictionary with the `requests` library, for example:
+
+ ```python
+ import requests
+
+ proxy = gp.get_proxy()
+ if proxy:
+     proxies = {
+         'http': f"http://{proxy['ip']}:{proxy['port']}",
+         'https': f"http://{proxy['ip']}:{proxy['port']}"
+     }
+     response = requests.get('https://httpbin.org/ip', proxies=proxies)
+     print(response.json())
+ else:
+     print("No working proxy found.")
+ ```
+
+ - `get_proxy()` will return the first working proxy found.
+ - `get_proxy_random()` will return a random working proxy (with up to 20 retries).
+
+ Both methods return a dictionary with proxy details (e.g., `{ 'ip': '...', 'port': '...', ... }`) or `None` if no working proxy is found.
{pocong-0.1.2 → pocong-1.0.0}/src/pocong.egg-info/SOURCES.txt
@@ -15,4 +15,9 @@ src/pocong.egg-info/dependency_links.txt
  src/pocong.egg-info/entry_points.txt
  src/pocong.egg-info/requires.txt
  src/pocong.egg-info/top_level.txt
- tests/test_pocong.py
+ src/pocong/proxy_spiders/__init__.py
+ src/pocong/proxy_spiders/pipelines.py
+ src/pocong/proxy_spiders/spiders/__init__.py
+ src/pocong/proxy_spiders/spiders/free_proxy_list_net_spider.py
+ tests/test_pocong.py
+ tests/test_proxy_spiders.py
{pocong-0.1.2 → pocong-1.0.0}/src/pocong.egg-info/requires.txt
@@ -1,3 +1,6 @@
+ Scrapy>=2.5.0
+ pandas>=1.3.0
+ requests>=2.25.0
  Click>=7.0

  [dev]
pocong-1.0.0/tests/test_proxy_spiders.py ADDED
@@ -0,0 +1,21 @@
+
+ import requests
+ import pytest
+
+ from pocong.proxy_spiders import GetProxy
+
+
+ def test_get_proxy():
+     """
+     Test that a random proxy from get_proxy can make a successful request to httpbin.org/ip.
+     """
+     proxy = GetProxy().get_proxy()
+     assert proxy is not None, "No proxy returned by get_proxy()"
+     proxies = {'https': f'http://{proxy["ip"]}:{proxy["port"]}'}
+     try:
+         response = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=10)
+         response.raise_for_status()
+         assert response.status_code == 200
+         assert 'origin' in response.text
+     except requests.RequestException as e:
+         pytest.skip(f"HTTP proxy request failed: {e}")
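
Because free proxies are unreliable, the test downgrades network errors to a skip rather than a failure. To run just this file programmatically (equivalent to `pytest -q tests/test_proxy_spiders.py` from the project root):

```python
import pytest

# Exits with pytest's own status code; the test skips itself when the
# fetched proxy cannot reach httpbin.org.
raise SystemExit(pytest.main(["-q", "tests/test_proxy_spiders.py"]))
```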
pocong-0.1.2/README.md DELETED
@@ -1,12 +0,0 @@
1
- <p align="center">
2
- <img src="https://gitlab.com/uploads/-/system/project/avatar/73633795/Screenshot_2025-08-22_at_18.40.11.png?width=128" alt="POCONG Logo" width="128"/>
3
- </p>
4
-
5
- # POCONG 🪦
6
- **Python Oriented Crawling ON Going**
7
-
8
- POCONG is a lightweight web crawling framework built in Python.
9
-
10
- ## Installation
11
- ```bash
12
- pip install pocong
9 files without changes