abstract-webtools 0.1.6.138__tar.gz → 0.1.6.140__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/PKG-INFO +1 -1
  2. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/setup.py +1 -1
  3. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/__init__.py +1 -0
  4. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/linkManager/linkManager.py +46 -24
  5. abstract_webtools-0.1.6.140/src/abstract_webtools/managers/middleManager/__init__.py +1 -0
  6. abstract_webtools-0.1.6.140/src/abstract_webtools/managers/middleManager/imports.py +18 -0
  7. abstract_webtools-0.1.6.140/src/abstract_webtools/managers/middleManager/src/UnifiedWebManager.py +135 -0
  8. abstract_webtools-0.1.6.140/src/abstract_webtools/managers/middleManager/src/__init__.py +2 -0
  9. abstract_webtools-0.1.6.140/src/abstract_webtools/managers/middleManager/src/legacy_tools.py +8 -0
  10. abstract_webtools-0.1.6.140/src/abstract_webtools/managers/requestManager/requestManager.py +400 -0
  11. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/soupManager/soupManager.py +10 -10
  12. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/urlManager/urlManager.py +20 -28
  13. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools.egg-info/PKG-INFO +1 -1
  14. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools.egg-info/SOURCES.txt +5 -0
  15. abstract_webtools-0.1.6.138/src/abstract_webtools/managers/requestManager/requestManager.py +0 -356
  16. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/README.md +0 -0
  17. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/pyproject.toml +0 -0
  18. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/setup.cfg +0 -0
  19. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/__init__.py +0 -0
  20. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/abstract_usurpit.py +0 -0
  21. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/abstract_webtools.py +0 -0
  22. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/big_user_agent_list.py +0 -0
  23. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/domain_identifier.py +0 -0
  24. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/extention_list.py +0 -0
  25. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/find_dirs.py +0 -0
  26. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/k2s_downloader.py +0 -0
  27. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/main.py +0 -0
  28. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/allss//.py" +0 -0
  29. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/cipherManager.py +0 -0
  30. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/clownworld/__init__.py +0 -0
  31. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/clownworld/get_bolshevid_video.py +0 -0
  32. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/crawlManager.py +0 -0
  33. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/crawlmgr2.py +0 -0
  34. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/curlMgr.py +0 -0
  35. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/domainManager.py +0 -0
  36. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/dynamicRateLimiter.py +0 -0
  37. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/get_test.py +0 -0
  38. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/linkManager/__init__.py +0 -0
  39. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/mySocketClient.py +0 -0
  40. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/networkManager.py +0 -0
  41. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/requestManager/__init__.py +0 -0
  42. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/seleniumManager.py +0 -0
  43. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/soupManager/__init__.py +0 -0
  44. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/soupManager/asoueces.py +0 -0
  45. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/sslManager.py +0 -0
  46. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/tlsAdapter.py +0 -0
  47. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/urlManager/__init__.py +0 -0
  48. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/urlManager/urlManager (Copy).py +0 -0
  49. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/userAgentManager.py +0 -0
  50. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/videoDownloader.py +0 -0
  51. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/videoDownloader2.py +0 -0
  52. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/soup_gui.py +0 -0
  53. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/url_grabber.py +0 -0
  54. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/url_grabber_new.py +0 -0
  55. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools.egg-info/dependency_links.txt +0 -0
  56. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools.egg-info/requires.txt +0 -0
  57. {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: abstract_webtools
3
- Version: 0.1.6.138
3
+ Version: 0.1.6.140
4
4
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
5
5
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
6
6
  Author: putkoff
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
4
4
  long_description = fh.read()
5
5
  setuptools.setup(
6
6
  name='abstract_webtools',
7
- version='0.1.6.138',
7
+ version='0.1.6.140',
8
8
  author='putkoff',
9
9
  author_email='partners@abstractendeavors.com',
10
10
  description='Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.',
@@ -12,3 +12,4 @@ from .urlManager import *
12
12
  from .userAgentManager import *
13
13
  from .seleniumManager import *
14
14
  from .videoDownloader import *
15
+ from .middleManager import *
@@ -2,18 +2,7 @@ from ...abstract_webtools import *
2
2
  from ..urlManager import *
3
3
  from ..requestManager import *
4
4
  from ..soupManager import *
5
- def get_url(url=None,url_mgr=None):
6
- if not url and not url_mgr:
7
- return None
8
- if url:
9
- url_mgr = urlManager(url)
10
- return url_mgr.url
11
- def get_url_mgr(url=None,url_mgr=None):
12
- if not url and not url_mgr:
13
- return None
14
- if url:
15
- url_mgr = urlManager(url)
16
- return url_mgr
5
+ from ..middleManager import *
17
6
  class linkManager:
18
7
  """
19
8
  LinkManager is a class for managing and extracting links and image links from a web page.
@@ -50,13 +39,40 @@ class linkManager:
50
39
  - The LinkManager class helps manage and extract links and image links from web pages.
51
40
  - The class provides flexibility in specifying criteria for link extraction.
52
41
  """
53
- def __init__(self,url=None,source_code=None,url_mgr=None,req_mgr=None,soup_manager=None,image_link_tags='img',img_link_attrs='src',link_tags='a',link_attrs='href',strict_order_tags=False,img_attr_value_desired=None,img_attr_value_undesired=None,link_attr_value_desired=None,link_attr_value_undesired=None,associated_data_attr=["data-title",'alt','title'],get_img=["data-title",'alt','title']):
54
- self.url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
55
- self.url=self.url_mgr.url
56
- self.req_mgr = get_req_mgr(req_mgr=req_mgr,url=self.url,url_mgr=self.url_mgr,source_code=source_code)
57
- self.source_code = source_code or self.req_mgr.source_code or self.req_mgr.source_code_bytes
58
- self.soup_mgr = get_soup_mgr(req_mgr=self.req_mgr,url_mgr=self.url_mgr,source_code = source_code)
42
+ def __init__(self,
43
+ url=None,
44
+ source_code=None,
45
+ soup=None,
46
+ url_mgr=None,
47
+ req_mgr=None,
48
+ soup_mgr=None,
49
+ image_link_tags='img',
50
+ img_link_attrs='src',
51
+ link_tags='a',
52
+ link_attrs='href',
53
+ strict_order_tags=False,
54
+ img_attr_value_desired=None,
55
+ img_attr_value_undesired=None,
56
+ link_attr_value_desired=None,
57
+ link_attr_value_undesired=None,
58
+ associated_data_attr=["data-title",'alt','title'],
59
+ get_img=["data-title",'alt','title']
60
+ ):
61
+
62
+
63
+ all_tools = get_soup_tools(
64
+ url=url,
65
+ url_mgr=url_mgr,
66
+ source_code=source_code,
67
+ req_mgr=req_mgr,
68
+ soup=soup,
69
+ soup_mgr=soup_mgr,
70
+ target_manager = self
71
+ )
59
72
 
73
+
74
+
75
+
60
76
  self.strict_order_tags=strict_order_tags
61
77
  self.image_link_tags=image_link_tags
62
78
  self.img_link_attrs=img_link_attrs
@@ -171,19 +187,25 @@ class linkManager:
171
187
  if url not in domains_ls and comp_domain == domain:
172
188
  domains_ls.append(url)
173
189
  return domains_ls
190
+
174
191
  def find_all_desired_links(self,tag='img', attr='src',attr_value_desired=None,strict_order_tags=False,attr_value_undesired=None,associated_data_attr=None,all_desired=None,get_img=None):
175
192
  all_desired = all_desired or self.find_all_desired(tag=tag,attr=attr,strict_order_tags=strict_order_tags,attr_value_desired=attr_value_desired,attr_value_undesired=attr_value_undesired,associated_data_attr=associated_data_attr,get_img=get_img)
176
193
  assiciated_attrs = all_desired[-1]
177
194
  valid_assiciated_attrs = []
178
195
  desired_links=[]
179
196
  for i,attr in enumerate(all_desired[:-1]):
180
- valid_attr=self.url_mgr.make_valid(attr,self.url_mgr.protocol+'://'+self.url_mgr.domain)
181
- if valid_attr:
182
- desired_links.append(valid_attr)
183
- valid_assiciated_attrs.append(assiciated_attrs[i])
184
- valid_assiciated_attrs[-1]["link"]=valid_attr
197
+
198
+ self.url_mgr.domain = self.url_mgr.domain or ''
199
+
200
+ self.url_mgr.protocol = self.url_mgr.protocol or 'https'
201
+
202
+ if attr:
203
+ valid_attr=self.url_mgr.make_valid(attr,self.url_mgr.protocol+'://'+self.url_mgr.domain)
204
+ if valid_attr:
205
+ desired_links.append(valid_attr)
206
+ valid_assiciated_attrs.append(assiciated_attrs[i])
207
+ valid_assiciated_attrs[-1]["link"]=valid_attr
185
208
  desired_links.append(valid_assiciated_attrs)
186
209
  return desired_links
187
210
 
188
-
189
211
 
@@ -0,0 +1,18 @@
1
+ import logging
2
+ from ..urlManager import (
3
+ urlManager,
4
+ get_url,
5
+ get_url_mgr
6
+ )
7
+ from ..requestManager import (
8
+ requestManager,
9
+ get_source,
10
+ get_req_mgr
11
+ )
12
+ from ..soupManager import (
13
+ soupManager,
14
+ get_soup,
15
+ get_soup_mgr
16
+ )
17
+ from bs4 import BeautifulSoup
18
+ logging.basicConfig(level=logging.INFO)
@@ -0,0 +1,135 @@
1
+ from ..imports import *
2
+
3
+ class UnifiedWebManager:
4
+ """
5
+ Unified middleware that ties together URL, request, and soup managers.
6
+ Lazily initializes components based on provided inputs.
7
+
8
+ Args:
9
+ url (str or None): The base URL.
10
+ source_code (str or bytes or None): Pre-fetched source code.
11
+ url_mgr (urlManager or None): Existing URL manager.
12
+ req_mgr (requestManager or None): Existing request manager.
13
+ soup_mgr (soupManager or None): Existing soup manager.
14
+ parse_type (str): Parser type for BeautifulSoup (default: "html.parser").
15
+ """
16
+ def __init__(self, url=None, source_code=None, url_mgr=None, req_mgr=None, soup_mgr=None,soup=None, parse_type="html.parser"):
17
+ self._url = url
18
+ self._source_code = source_code
19
+ self._url_mgr = url_mgr
20
+ self._req_mgr = req_mgr
21
+ self._soup_mgr = soup_mgr
22
+ self._parse_type = parse_type
23
+ self._soup = None # Lazy
24
+
25
+ @property
26
+ def url_mgr(self):
27
+ if self._url_mgr is None:
28
+ if self._url is None:
29
+ logging.warning("No URL provided; URL manager cannot be created.")
30
+ return None
31
+ self._url_mgr = urlManager(url=self._url)
32
+ return self._url_mgr
33
+
34
+ @property
35
+ def url(self):
36
+ if self._url is None and self.url_mgr:
37
+ self._url = self.url_mgr.url
38
+ return self._url
39
+
40
+ @property
41
+ def req_mgr(self):
42
+ if self._req_mgr is None:
43
+ self._req_mgr = requestManager(
44
+ url=self.url,
45
+ url_mgr=self.url_mgr,
46
+ source_code=self._source_code
47
+ )
48
+ return self._req_mgr
49
+
50
+ @property
51
+ def source_code(self):
52
+ if self._source_code is None and self.req_mgr:
53
+ self._source_code = self.req_mgr.source_code
54
+ return self._source_code
55
+
56
+ @property
57
+ def soup_mgr(self):
58
+ if self._soup_mgr is None:
59
+ self._soup_mgr = soupManager(
60
+ url=self.url,
61
+ url_mgr=self.url_mgr,
62
+ req_mgr=self.req_mgr,
63
+ source_code=self.source_code
64
+ )
65
+ return self._soup_mgr
66
+
67
+ @property
68
+ def soup(self):
69
+ if self._soup is None:
70
+ source = self.source_code
71
+ if source is None:
72
+ logging.warning("No source code available; Soup cannot be created.")
73
+ return None
74
+ if isinstance(source, bytes):
75
+ source = source.decode('utf-8', errors='ignore')
76
+ self._soup = BeautifulSoup(source, self._parse_type)
77
+ return self._soup
78
+
79
+ def update_url(self, url):
80
+ """Update the URL and reset dependent managers."""
81
+ self._url = url
82
+ self._url_mgr = None
83
+ self._req_mgr = None
84
+ self._soup_mgr = None
85
+ self._source_code = None
86
+ self._soup = None
87
+
88
+ def update_source_code(self, source_code):
89
+ """Update the source code and reset dependent managers."""
90
+ self._source_code = source_code
91
+ self._req_mgr = None
92
+ self._soup_mgr = None
93
+ self._soup = None
94
+
95
+ # Convenience methods for direct access
96
+ def get_all_tools(self):
97
+ """Return a dict with all components (similar to original getters)."""
98
+ return {
99
+ 'url': self.url,
100
+ 'url_mgr': self.url_mgr,
101
+ 'source_code': self.source_code,
102
+ 'req_mgr': self.req_mgr,
103
+ 'soup': self.soup,
104
+ 'soup_mgr': self.soup_mgr
105
+ }
106
+ def endow_to_manager(self, target_manager, all_tools=None):
107
+ """
108
+ Endow (assign) the attributes from all_tools to the target manager instance.
109
+
110
+ Args:
111
+ target_manager: The instance (e.g., another manager class) to endow attributes to.
112
+ all_tools (dict or None): Optional dict of tools/attributes. If None, uses self.get_all_tools().
113
+ """
114
+ if all_tools is None:
115
+ all_tools = self.get_all_tools()
116
+ for key, value in all_tools.items():
117
+ setattr(target_manager, key, value)
118
+ # Wrapper functions for backward compatibility
119
+ def get_url_tools(url=None, url_mgr=None):
120
+ mgr = UnifiedWebManager(url=url, url_mgr=url_mgr)
121
+ return {'url': mgr.url, 'url_mgr': mgr.url_mgr}
122
+
123
+ def get_req_tools(url=None, url_mgr=None, source_code=None, req_mgr=None):
124
+ mgr = UnifiedWebManager(url=url, url_mgr=url_mgr, source_code=source_code, req_mgr=req_mgr)
125
+ return {'url': mgr.url, 'url_mgr': mgr.url_mgr, 'source_code': mgr.source_code, 'req_mgr': mgr.req_mgr}
126
+
127
+ def get_soup_tools(url=None, url_mgr=None, source_code=None, req_mgr=None, soup=None, soup_mgr=None,target_manager=None):
128
+ mgr = UnifiedWebManager(url=url, url_mgr=url_mgr, source_code=source_code, req_mgr=req_mgr, soup_mgr=soup_mgr)
129
+ if soup is not None:
130
+ mgr._soup = soup # Allow overriding
131
+ if target_manager:
132
+ mgr.endow_to_manager(target_manager, all_tools=None)
133
+ return mgr.get_all_tools()
134
+
135
+
@@ -0,0 +1,2 @@
1
+ from .UnifiedWebManager import *
2
+ from .legacy_tools import *
@@ -0,0 +1,8 @@
1
+ from ..imports import (
2
+ get_url,
3
+ get_soup_mgr,
4
+ get_source,
5
+ get_req_mgr,
6
+ get_soup,
7
+ get_soup_mgr
8
+ )