blue-assistant 4.273.1-py3-none-any.whl → 4.283.1-py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
--- a/blue_assistant/__init__.py
+++ b/blue_assistant/__init__.py
@@ -4,7 +4,7 @@ ICON = "🧠"
 
 DESCRIPTION = f"{ICON} An AI Assistant."
 
-VERSION = "4.273.1"
+VERSION = "4.283.1"
 
 REPO_NAME = "blue-assistant"
 
--- a/blue_assistant/script/actions/web_crawl.py
+++ b/blue_assistant/script/actions/web_crawl.py
@@ -34,7 +34,7 @@ def web_crawl(
         logger.error(f"{node_name}: {seed_url_var_name}: seed_urls not found in vars.")
         return False
     seed_urls = script.vars[seed_url_var_name]
-    log_list(logger, seed_urls, "seed url(s)")
+    log_list(logger, "using", seed_urls, "seed url(s)")
 
     visited_urls = crawl_list_of_urls(
         seed_urls=seed_urls,
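The `log_list` call above gains a leading verb argument (`"using"`); together with the `log_dict` changes below, this tracks the updated `blue_options.logger` helpers this release is built against (`blue_options-4.240.1`, per the METADATA diff further down). A minimal sketch of signatures compatible with these call sites — an assumption inferred from the diff, not the actual `blue_options` implementation:

```python
# Hypothetical signatures inferred from the call sites in this diff:
# log_list(logger, verb, items, item_name) / log_dict(logger, verb, items, item_name).
# Not the actual blue_options code.
from logging import Logger
from typing import Dict, List


def log_list(logger: Logger, verb: str, items: List[str], item_name: str) -> None:
    # renders e.g. 'using 3 seed url(s): <url>, <url>, <url>'
    logger.info("{} {} {}: {}".format(verb, len(items), item_name, ", ".join(items)))


def log_dict(logger: Logger, verb: str, items: Dict[str, str], item_name: str) -> None:
    # renders e.g. 'crawled 12 url(s)' followed by one line per entry
    logger.info("{} {} {}".format(verb, len(items), item_name))
    for key, value in items.items():
        logger.info("  {}: {}".format(key, value))
```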
--- a/blue_assistant/web/__main__.py
+++ b/blue_assistant/web/__main__.py
@@ -50,6 +50,7 @@ args = parser.parse_args()
 
 success = False
 if args.task == "crawl":
+    success = True
     dict_of_urls = crawl_list_of_urls(
         seed_urls=args.seed_urls.split("+"),
         object_name=args.object_name,
@@ -58,13 +59,7 @@ if args.task == "crawl":
     )
 
     if args.verbose == 1:
-        log_dict(logger, dict_of_urls, "url(s)")
-
-    success = post_to_object(
-        args.object_name,
-        NAME.replace(".", "-"),
-        dict_of_urls,
-    )
+        log_dict(logger, "crawled", dict_of_urls, "url(s)")
 elif args.task == "fetch":
     summary = fetch_links_and_text(
         url=args.url,
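Two behavioral notes on the `crawl` task: `success` is now set optimistically before the crawl starts, and the explicit `post_to_object(...)` of the results is gone — persistence moves into `crawl_list_of_urls` itself (see the `crawl.py` hunks below). A sketch of the resulting call site, with the object name and iteration cap assumed for illustration:

```python
# Sketch of driving the refactored crawler; crawl_list_of_urls now persists
# its own cache and queue, so the caller only needs to log the result.
from blue_assistant.web.crawl import crawl_list_of_urls

dict_of_urls = crawl_list_of_urls(
    seed_urls="https://a.example+https://b.example".split("+"),  # "+"-separated, as on the CLI
    object_name="web-crawl-2025",  # hypothetical object name
    max_iterations=10,  # from the crawl loop below; -1 appears to mean unbounded
)
print(f"crawled {len(dict_of_urls)} url(s)")
```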
--- a/blue_assistant/web/crawl.py
+++ b/blue_assistant/web/crawl.py
@@ -1,6 +1,7 @@
-from typing import List, Dict, Set
+from typing import List, Dict
 
 from blueness import module
+from blue_options.logger import log_dict, log_list
 from blue_objects import file
 from blue_objects import objects
 from blue_objects.metadata import get_from_object, post_to_object
@@ -30,24 +31,33 @@ def crawl_list_of_urls(
         )
     )
 
-    crawl_cache: Dict[str, str] = (
-        get_from_object(
+    crawl_cache: Dict[str, str] = {}
+    queue: List[str] = [url for url in seed_urls]
+
+    if use_cache:
+        crawl_cache = get_from_object(
             object_name,
             "crawl_cache",
             {},
         )
-        if use_cache
-        else {}
-    )
+        log_dict(logger, "loaded cache:", crawl_cache, "url(s)")
 
-    queue: Set[str] = set(seed_urls)
+        queue += get_from_object(
+            object_name,
+            "crawl_queue",
+            [],
+        )
+
+    log_list(logger, "queue:", queue, "url(s)")
 
     iteration: int = 0
     while queue:
-        url = queue.pop()
+        url = queue[0]
+        queue = queue[1:]
+
        logger.info(
             "{} {} ...".format(
-                "✅" if url in crawl_cache else "🔗",
+                "✅ " if url in crawl_cache else "🔗 ",
                 url,
             )
         )
@@ -70,8 +80,17 @@ def crawl_list_of_urls(
             )
 
         crawl_cache[url] = content_type
-        if "links" in url_summary:
-            queue.update(url_summary["links"] - crawl_cache.keys())
+        if "list_of_urls" in url_summary:
+            queue = list(
+                set(
+                    queue
+                    + [
+                        url
+                        for url in url_summary["list_of_urls"]
+                        if url not in crawl_cache.keys()
+                    ]
+                )
+            )
 
         iteration += 1
         if max_iterations != -1 and iteration >= max_iterations:
@@ -88,4 +107,13 @@ def crawl_list_of_urls(
         crawl_cache,
     )
 
+    post_to_object(
+        object_name,
+        "crawl_queue",
+        queue,
+    )
+
+    log_dict(logger, "crawled", crawl_cache, "url(s)")
+    log_list(logger, "queue:", queue, "url(s)")
+
     return crawl_cache
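Net effect of the four `crawl.py` hunks: the unordered `Set[str]` frontier becomes an ordered list consumed FIFO (`queue[0]` / `queue[1:]`), and both the cache and the unfinished queue are persisted to the object (`crawl_cache`, `crawl_queue`), so an interrupted crawl can resume where it left off. A standalone sketch of this pattern, with the `blue_objects` storage layer stubbed by an in-memory dict and the cache-skip behavior assumed:

```python
# Minimal sketch of the persisted-FIFO crawl loop introduced above; `storage`
# is an in-memory stand-in for get_from_object/post_to_object.
from typing import Callable, Dict, List

storage: Dict[str, object] = {}


def crawl(
    seed_urls: List[str],
    fetch: Callable[[str], Dict],  # assumed to return {"content_type": ..., "list_of_urls": [...]}
    max_iterations: int = -1,
) -> Dict[str, str]:
    crawl_cache: Dict[str, str] = dict(storage.get("crawl_cache", {}))  # resume cache
    queue: List[str] = list(seed_urls) + list(storage.get("crawl_queue", []))  # resume queue

    iteration = 0
    while queue:
        url, queue = queue[0], queue[1:]  # FIFO pop, as in the diff
        if url not in crawl_cache:
            url_summary = fetch(url)
            crawl_cache[url] = url_summary.get("content_type", "")
            # merge newly discovered urls, dropping anything already crawled;
            # list(set(...)) mirrors the diff and does not preserve order.
            queue = list(
                set(queue + [u for u in url_summary.get("list_of_urls", []) if u not in crawl_cache])
            )
        iteration += 1
        if max_iterations != -1 and iteration >= max_iterations:
            break

    storage["crawl_cache"] = crawl_cache
    storage["crawl_queue"] = queue  # unfinished work survives to the next run
    return crawl_cache
```

One consequence of the `list(set(...))` merge in the released code is that FIFO order only holds between merges; strict breadth-first order would need an order-preserving dedup.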
--- a/blue_assistant/web/fetch.py
+++ b/blue_assistant/web/fetch.py
@@ -1,4 +1,4 @@
-from typing import Dict, Any
+from typing import Dict, Any, List
 import requests
 from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
 from urllib.parse import urljoin
@@ -33,7 +33,8 @@ def fetch_links_and_text(
     content_type = response.headers.get("Content-Type", "")
     logger.info(f"content-type: {content_type}")
 
-    links = set()
+    list_of_urls: List[str] = []
+    list_of_ignored_urls: List[str] = []
     text = ""
 
     if not any(
@@ -49,11 +50,15 @@ def fetch_links_and_text(
            a_url = urljoin(url, a_tag["href"])
 
            if a_url.startswith(url):
-                logger.info(f"+= {a_url}")
-                links.add(a_url)
+                if url not in list_of_urls:
+                    logger.info(f"+= {a_url}")
+                    list_of_urls += [a_url]
                continue
 
-            logger.info(f"ignored: {a_url}")
+            if a_url not in list_of_ignored_urls:
+                list_of_ignored_urls += [a_url]
+                if verbose:
+                    logger.info(f"ignored: {a_url}")
 
        text = soup.get_text(separator=" ", strip=True)
 
@@ -64,12 +69,22 @@ def fetch_links_and_text(
     text = re.sub(r"\s+", " ", text).strip()
 
     if verbose:
-        log_list(logger, list(links), "link(s)")
+        log_list(logger, "fetched", list_of_urls, "url(s)")
+        log_list(logger, "ignored", list_of_ignored_urls, "url(s)")
         log_long_text(logger, text)
+    else:
+        logger.info(
+            "{} url(s) collected, {} url(s) ignored, text: {:,} char(s).".format(
+                len(list_of_urls),
+                len(list_of_ignored_urls),
+                len(text),
+            )
+        )
 
     return {
         "url": url,
         "content_type": content_type,
-        "links": links,
+        "list_of_ignored_urls": list_of_ignored_urls,
+        "list_of_urls": list_of_urls,
         "text": text,
     }
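On the fetch side, the `links` set becomes two ordered lists with explicit membership checks, and ignored links are now logged at most once, and only when verbose. A minimal sketch of that collection pattern with illustrative inputs; note the released hunk tests `url not in list_of_urls` where `a_url` seems intended, so the sketch deduplicates on `a_url`:

```python
# Sketch of ordered, deduplicated link collection as in fetch.py.
# Requires beautifulsoup4; url/html are illustrative inputs.
from typing import List
from urllib.parse import urljoin

from bs4 import BeautifulSoup

url = "https://example.com/docs/"
html = '<a href="intro.html">x</a><a href="intro.html">y</a><a href="https://other.example/">z</a>'

list_of_urls: List[str] = []
list_of_ignored_urls: List[str] = []

soup = BeautifulSoup(html, "html.parser")
for a_tag in soup.find_all("a", href=True):
    a_url = urljoin(url, a_tag["href"])  # resolve relative hrefs against the page

    if a_url.startswith(url):  # stay under the seed url's prefix
        if a_url not in list_of_urls:  # ordered dedup instead of a set
            list_of_urls += [a_url]
        continue

    if a_url not in list_of_ignored_urls:
        list_of_ignored_urls += [a_url]

print(list_of_urls)  # ['https://example.com/docs/intro.html']
print(list_of_ignored_urls)  # ['https://other.example/']
```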
--- a/blue_assistant/web/functions.py
+++ b/blue_assistant/web/functions.py
@@ -1,19 +1,4 @@
-from typing import List, Dict, Set, Tuple
-import requests
-from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
-from urllib.parse import urljoin
 import re
-import warnings
-
-from blueness import module
-from blue_options.logger import log_long_text, log_list
-
-from blue_assistant import NAME
-from blue_assistant.logger import logger
-
-warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
-
-NAME = module.name(__file__, NAME)
 
 
 def url_to_filename(
--- a/blue_assistant-4.273.1.dist-info/METADATA
+++ b/blue_assistant-4.283.1.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: blue_assistant
-Version: 4.273.1
+Version: 4.283.1
 Summary: 🧠 An AI Assistant.
 Home-page: https://github.com/kamangir/blue-assistant
 Author: Arash Abadpour (Kamangir)
@@ -124,4 +124,4 @@ Also home to [`@web`](https://raw.githubusercontent.com/kamangir/blue-assistant/
 
 [![pylint](https://github.com/kamangir/blue-assistant/actions/workflows/pylint.yml/badge.svg)](https://github.com/kamangir/blue-assistant/actions/workflows/pylint.yml) [![pytest](https://github.com/kamangir/blue-assistant/actions/workflows/pytest.yml/badge.svg)](https://github.com/kamangir/blue-assistant/actions/workflows/pytest.yml) [![bashtest](https://github.com/kamangir/blue-assistant/actions/workflows/bashtest.yml/badge.svg)](https://github.com/kamangir/blue-assistant/actions/workflows/bashtest.yml) [![PyPI version](https://img.shields.io/pypi/v/blue-assistant.svg)](https://pypi.org/project/blue-assistant/) [![PyPI - Downloads](https://img.shields.io/pypi/dd/blue-assistant)](https://pypistats.org/packages/blue-assistant)
 
-built by 🌀 [`blue_options-4.236.1`](https://github.com/kamangir/awesome-bash-cli), based on 🧠 [`blue_assistant-4.273.1`](https://github.com/kamangir/blue-assistant).
+built by 🌀 [`blue_options-4.240.1`](https://github.com/kamangir/awesome-bash-cli), based on 🧠 [`blue_assistant-4.283.1`](https://github.com/kamangir/blue-assistant).
--- a/blue_assistant-4.273.1.dist-info/RECORD
+++ b/blue_assistant-4.283.1.dist-info/RECORD
@@ -1,5 +1,5 @@
 blue_assistant/README.py,sha256=EJORj3I5pucJplI86lrFaZBN5C9IYNgKoG_V7h27NHw,2586
-blue_assistant/__init__.py,sha256=pPnU1NLSA8YbfAPdzhjwL5J0c1GSxruUlXume0IxvIE,311
+blue_assistant/__init__.py,sha256=sPtxnf1X-ZphpOLGOeSuDHB2HL_PYWxMF_E6-7ERMEA,311
 blue_assistant/__main__.py,sha256=URtal70XZc0--3FDTYWcLtnGOqBYjMX9gt-L1k8hDXI,361
 blue_assistant/config.env,sha256=npodyuuhkZUHUv9FnEiQQZkKxFbg8nQb1YpOCURqV3Y,301
 blue_assistant/env.py,sha256=FTSdJ8-J4jAyI0-h3MBgOweQBWd3YEFIibBHSXpClrY,760
@@ -43,7 +43,7 @@ blue_assistant/script/actions/__init__.py,sha256=GJJCaXSrfhwW9K5A2PHmrS7iGl-Ss9U
 blue_assistant/script/actions/generate_image.py,sha256=brg3u6e-cZvBLK8B7UAh-CXgmAwyvSN-jE00EXMSF3A,1357
 blue_assistant/script/actions/generate_text.py,sha256=GQ7sF1J-vBNgr-h01RZHMDQJOGDXx8PzQWEVNQwW_TU,1910
 blue_assistant/script/actions/generic.py,sha256=EjJkDj82ZFFQbjn-Uib-2Qs-nZG8kR2NzhsEVuOpgWQ,376
-blue_assistant/script/actions/web_crawl.py,sha256=-EYbpXHbqPzKj-5AiDiXNgYIyUn-qv3mFz6RxhgBovo,1523
+blue_assistant/script/actions/web_crawl.py,sha256=LoZXEKOlSEVVVJaSobMqnOjbpumtScaCta3I8TQbV-A,1532
 blue_assistant/script/repository/__init__.py,sha256=zVI3cubRqM9H6WgF0EUP9idILVLCumPFmJgKPM7iVlM,604
 blue_assistant/script/repository/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 blue_assistant/script/repository/base/classes.py,sha256=gM8OB1iKRKLa_dk7esAogAR9WaNZFgLhXkpd1iTtsuQ,5376
@@ -62,12 +62,12 @@ blue_assistant/script/repository/hue/colors.py,sha256=rUdtCroNAnzm1zUuVp8eVhvfIi
 blue_assistant/script/repository/orbital_data_explorer/__init__.py,sha256=yy5FtCeHlr9dRfqxw4QYWr7_yRjnQpwVyuAY2vLrh4Q,110
 blue_assistant/script/repository/orbital_data_explorer/classes.py,sha256=ixYd_FHWYtp8Sfd6AiZkIqePjoUlT9iLg7TvuxHIDzA,204
 blue_assistant/web/__init__.py,sha256=70_JSpnfX1mLm8Xv3xHIujfr2FfGeHPRs6HraWDP1XA,114
-blue_assistant/web/__main__.py,sha256=35RG-pqWEUcqNz0R59Efw4m00Azc46lu2r-42yX3Xqk,1663
-blue_assistant/web/crawl.py,sha256=K9EXWxhKBt6I9eViqSdpcx-z2aGsroafi2cmVOhgbOE,2368
-blue_assistant/web/fetch.py,sha256=b3EEMHoi-Tv2r2I5B2AVbDtHqrK0il42hvi2ZnBOKMY,1825
-blue_assistant/web/functions.py,sha256=CHGPM8RF-JtZlSL2vE0NFSZJVXMMMrXs8biwFk_JsSA,1042
-blue_assistant-4.273.1.dist-info/LICENSE,sha256=ogEPNDSH0_dhiv_lT3ifVIdgIzHAqNA_SemnxUfPBJk,7048
-blue_assistant-4.273.1.dist-info/METADATA,sha256=W6oaQcpfGcBWJfqHPU7K3gsZbljDiw82Y3Tn1tCKIrI,5380
-blue_assistant-4.273.1.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
-blue_assistant-4.273.1.dist-info/top_level.txt,sha256=ud0BkBbdOVze13bNqHuhZj1rwCztaBtDf5ChEYzASOs,15
-blue_assistant-4.273.1.dist-info/RECORD,,
+blue_assistant/web/__main__.py,sha256=4s2LNikNiT4UTbzOVQzV4j2jUWfmVIbE36WS1BxTWJY,1576
+blue_assistant/web/crawl.py,sha256=yeo_HJhX8Pp5E1BC7ZGBzNs_c6pvMYSC3olvr5K27hU,3118
+blue_assistant/web/fetch.py,sha256=0hbT246VzpYVCfJ8eflIZWGFMJoxml9vj-sYRCedCH4,2469
+blue_assistant/web/functions.py,sha256=uJAC_kGOn2wA9AwOB_FB2f1dFYyNaEPPC42lN3klnFc,618
+blue_assistant-4.283.1.dist-info/LICENSE,sha256=ogEPNDSH0_dhiv_lT3ifVIdgIzHAqNA_SemnxUfPBJk,7048
+blue_assistant-4.283.1.dist-info/METADATA,sha256=SF7LXKUsui8jj3MOhUDTRA54gt2AT5HQ4y_dIsRuA0Q,5380
+blue_assistant-4.283.1.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+blue_assistant-4.283.1.dist-info/top_level.txt,sha256=ud0BkBbdOVze13bNqHuhZj1rwCztaBtDf5ChEYzASOs,15
+blue_assistant-4.283.1.dist-info/RECORD,,