py2ls 0.1.4.4__py3-none-any.whl → 0.1.4.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.DS_Store +0 -0
- py2ls/.git/COMMIT_EDITMSG +1 -4
- py2ls/.git/FETCH_HEAD +1 -1
- py2ls/.git/index +0 -0
- py2ls/.git/logs/HEAD +5 -0
- py2ls/.git/logs/refs/heads/main +5 -0
- py2ls/.git/logs/refs/remotes/origin/HEAD +12 -0
- py2ls/.git/logs/refs/remotes/origin/main +4 -0
- py2ls/.git/objects/1a/b4585881a6a42889f01aa0cfe25fd5acfaf46f +0 -0
- py2ls/.git/objects/50/08ddfcf53c02e82d7eee2e57c38e5672ef89f6 +0 -0
- py2ls/.git/objects/53/e0deb1cb4c2c606bced6e7f9a66b0fda60980d +0 -0
- py2ls/.git/objects/56/e4e8b2d5545e0256090f45aa8fc42c5fe067d0 +0 -0
- py2ls/.git/objects/62/7c81b23b4e56e87b042b650b0103653cc9e34a +0 -0
- py2ls/.git/objects/62/d90ccf8cbefdc2e4fd475e7c6f4f76e9fdf801 +3 -0
- py2ls/.git/objects/6a/52e747a2b349b128d1490d9e896d2323818eb7 +0 -0
- py2ls/.git/objects/6c/cebb29b7f3f5b0c889f6dadbf9ff066554587d +0 -0
- py2ls/.git/objects/71/36b2074a2754be8b58127d82250e5b37e3c373 +0 -0
- py2ls/.git/objects/81/8f26b7bf042269729020cf944fc362d66ba27e +0 -0
- py2ls/.git/objects/84/59071b722a255b774a80b27746033f8141ab39 +0 -0
- py2ls/.git/objects/8b/84f56978e1de8f2ae82abce5f8b3e182d365cd +0 -0
- py2ls/.git/objects/b5/61831c7dce8ea51e7ee6b6fa35745f14d8242d +0 -0
- py2ls/.git/objects/c1/20fc812b9ad311c34a3608512d6a9d976bb48e +0 -0
- py2ls/.git/objects/d6/9ab1c4aadf279936dd778e8346ba60f74705b6 +0 -0
- py2ls/.git/objects/d9/dfa5aee51e92a541b707e8e7baea6f06deff98 +0 -0
- py2ls/.git/objects/db/141dbaa93594df2a8156182f361ee4db829359 +0 -0
- py2ls/.git/objects/e3/1356f90ea6dd0577b5e0b40b206319adcbf085 +0 -0
- py2ls/.git/objects/fa/147e6bb78a2e8db241d231295fd7f1ed061af8 +0 -0
- py2ls/.git/refs/heads/main +1 -1
- py2ls/.git/refs/remotes/origin/main +1 -1
- py2ls/__init__.py +1 -2
- py2ls/ips.py +90 -6
- py2ls/netfinder.py +18 -10
- py2ls/setuptools-70.1.0-py3-none-any.whl +0 -0
- {py2ls-0.1.4.4.dist-info → py2ls-0.1.4.6.dist-info}/METADATA +139 -2
- {py2ls-0.1.4.4.dist-info → py2ls-0.1.4.6.dist-info}/RECORD +36 -17
- py2ls/internet_finder.py +0 -405
- py2ls/version.py +0 -1
- {py2ls-0.1.4.4.dist-info → py2ls-0.1.4.6.dist-info}/WHEEL +0 -0
py2ls/.DS_Store
ADDED
Binary file
|
py2ls/.git/COMMIT_EDITMSG
CHANGED
py2ls/.git/FETCH_HEAD
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
c120fc812b9ad311c34a3608512d6a9d976bb48e branch 'main' of https://github.com/Jianfengliu0413/py2ls
|
py2ls/.git/index
CHANGED
Binary file
|
py2ls/.git/logs/HEAD
CHANGED
@@ -8,3 +8,8 @@ d9c2403fd166ce791b4e9d0c6792ed8342c71fcd 14449a0e6ba4ea2f1a73acf63ef91c9c6193f9e
|
|
8
8
|
3bbd972aa7ad680858f8dfbd0f7fcd97756f0d6f 48a88fc5806305d0bb0755ee6801161b79696972 Jianfeng <Jianfeng.Liu0413@gmail.com> 1718526957 +0200 commit: Update netfinder.py
|
9
9
|
48a88fc5806305d0bb0755ee6801161b79696972 cf0c0d9c6fb09473aaeb7f7e2edbd770c3f2ef3d Jianfeng <Jianfeng.Liu0413@gmail.com> 1718553462 +0200 commit: new feature: display_thumbnail_figure
|
10
10
|
cf0c0d9c6fb09473aaeb7f7e2edbd770c3f2ef3d 6427a4edff08f93d98f511418423f09f2ab90bcd Jianfeng <Jianfeng.Liu0413@gmail.com> 1718555179 +0200 commit: listdir type
|
11
|
+
6427a4edff08f93d98f511418423f09f2ab90bcd d9dfa5aee51e92a541b707e8e7baea6f06deff98 Jianfeng <Jianfeng.Liu0413@gmail.com> 1718570098 +0200 commit: imgsets update
|
12
|
+
d9dfa5aee51e92a541b707e8e7baea6f06deff98 e31356f90ea6dd0577b5e0b40b206319adcbf085 Jianfeng <Jianfeng.Liu0413@gmail.com> 1718618413 +0200 commit: add thumbnail
|
13
|
+
e31356f90ea6dd0577b5e0b40b206319adcbf085 1ab4585881a6a42889f01aa0cfe25fd5acfaf46f Jianfeng <Jianfeng.Liu0413@gmail.com> 1718619568 +0200 commit: Update ips.py
|
14
|
+
1ab4585881a6a42889f01aa0cfe25fd5acfaf46f 627c81b23b4e56e87b042b650b0103653cc9e34a Jianfeng <Jianfeng.Liu0413@gmail.com> 1718698925 +0200 commit: update xample_imgsets
|
15
|
+
627c81b23b4e56e87b042b650b0103653cc9e34a c120fc812b9ad311c34a3608512d6a9d976bb48e Jianfeng <Jianfeng.Liu0413@gmail.com> 1718893141 +0200 commit: update
|
py2ls/.git/logs/refs/heads/main
CHANGED
@@ -8,3 +8,8 @@ d9c2403fd166ce791b4e9d0c6792ed8342c71fcd 14449a0e6ba4ea2f1a73acf63ef91c9c6193f9e
|
|
8
8
|
3bbd972aa7ad680858f8dfbd0f7fcd97756f0d6f 48a88fc5806305d0bb0755ee6801161b79696972 Jianfeng <Jianfeng.Liu0413@gmail.com> 1718526957 +0200 commit: Update netfinder.py
|
9
9
|
48a88fc5806305d0bb0755ee6801161b79696972 cf0c0d9c6fb09473aaeb7f7e2edbd770c3f2ef3d Jianfeng <Jianfeng.Liu0413@gmail.com> 1718553462 +0200 commit: new feature: display_thumbnail_figure
|
10
10
|
cf0c0d9c6fb09473aaeb7f7e2edbd770c3f2ef3d 6427a4edff08f93d98f511418423f09f2ab90bcd Jianfeng <Jianfeng.Liu0413@gmail.com> 1718555179 +0200 commit: listdir type
|
11
|
+
6427a4edff08f93d98f511418423f09f2ab90bcd d9dfa5aee51e92a541b707e8e7baea6f06deff98 Jianfeng <Jianfeng.Liu0413@gmail.com> 1718570098 +0200 commit: imgsets update
|
12
|
+
d9dfa5aee51e92a541b707e8e7baea6f06deff98 e31356f90ea6dd0577b5e0b40b206319adcbf085 Jianfeng <Jianfeng.Liu0413@gmail.com> 1718618413 +0200 commit: add thumbnail
|
13
|
+
e31356f90ea6dd0577b5e0b40b206319adcbf085 1ab4585881a6a42889f01aa0cfe25fd5acfaf46f Jianfeng <Jianfeng.Liu0413@gmail.com> 1718619568 +0200 commit: Update ips.py
|
14
|
+
1ab4585881a6a42889f01aa0cfe25fd5acfaf46f 627c81b23b4e56e87b042b650b0103653cc9e34a Jianfeng <Jianfeng.Liu0413@gmail.com> 1718698925 +0200 commit: update xample_imgsets
|
15
|
+
627c81b23b4e56e87b042b650b0103653cc9e34a c120fc812b9ad311c34a3608512d6a9d976bb48e Jianfeng <Jianfeng.Liu0413@gmail.com> 1718893141 +0200 commit: update
|
@@ -24,3 +24,15 @@ d9c2403fd166ce791b4e9d0c6792ed8342c71fcd d9c2403fd166ce791b4e9d0c6792ed8342c71fc
|
|
24
24
|
48a88fc5806305d0bb0755ee6801161b79696972 48a88fc5806305d0bb0755ee6801161b79696972 Jianfeng Liu <macjianfeng@jflmbp.speedport.ip> 1718553371 +0200 remote set-head
|
25
25
|
cf0c0d9c6fb09473aaeb7f7e2edbd770c3f2ef3d cf0c0d9c6fb09473aaeb7f7e2edbd770c3f2ef3d Jianfeng Liu <macjianfeng@jflmbp.speedport.ip> 1718553465 +0200 remote set-head
|
26
26
|
6427a4edff08f93d98f511418423f09f2ab90bcd 6427a4edff08f93d98f511418423f09f2ab90bcd Jianfeng Liu <macjianfeng@jflmbp.speedport.ip> 1718555183 +0200 remote set-head
|
27
|
+
6427a4edff08f93d98f511418423f09f2ab90bcd 6427a4edff08f93d98f511418423f09f2ab90bcd Jianfeng Liu <macjianfeng@jflmbp.speedport.ip> 1718570095 +0200 remote set-head
|
28
|
+
d9dfa5aee51e92a541b707e8e7baea6f06deff98 d9dfa5aee51e92a541b707e8e7baea6f06deff98 Jianfeng Liu <macjianfeng@jflmbp.speedport.ip> 1718570102 +0200 remote set-head
|
29
|
+
e31356f90ea6dd0577b5e0b40b206319adcbf085 e31356f90ea6dd0577b5e0b40b206319adcbf085 Jianfeng Liu <macjianfeng@JFLMBP.local> 1718618416 +0200 remote set-head
|
30
|
+
1ab4585881a6a42889f01aa0cfe25fd5acfaf46f 1ab4585881a6a42889f01aa0cfe25fd5acfaf46f Jianfeng Liu <macjianfeng@JFLMBP.local> 1718619571 +0200 remote set-head
|
31
|
+
1ab4585881a6a42889f01aa0cfe25fd5acfaf46f 1ab4585881a6a42889f01aa0cfe25fd5acfaf46f Jianfeng Liu <macjianfeng@JFLMBP.local> 1718698084 +0200 remote set-head
|
32
|
+
1ab4585881a6a42889f01aa0cfe25fd5acfaf46f 1ab4585881a6a42889f01aa0cfe25fd5acfaf46f Jianfeng Liu <macjianfeng@JFLMBP.local> 1718699917 +0200 remote set-head
|
33
|
+
1ab4585881a6a42889f01aa0cfe25fd5acfaf46f 1ab4585881a6a42889f01aa0cfe25fd5acfaf46f Jianfeng Liu <macjianfeng@JFLMBP.local> 1718701729 +0200 remote set-head
|
34
|
+
1ab4585881a6a42889f01aa0cfe25fd5acfaf46f 1ab4585881a6a42889f01aa0cfe25fd5acfaf46f Jianfeng Liu <macjianfeng@JFLMBP.local> 1718703544 +0200 remote set-head
|
35
|
+
1ab4585881a6a42889f01aa0cfe25fd5acfaf46f 1ab4585881a6a42889f01aa0cfe25fd5acfaf46f Jianfeng Liu <macjianfeng@JFLMBP.local> 1718705358 +0200 remote set-head
|
36
|
+
1ab4585881a6a42889f01aa0cfe25fd5acfaf46f 1ab4585881a6a42889f01aa0cfe25fd5acfaf46f Jianfeng Liu <macjianfeng@JFLMBP.local> 1718707170 +0200 remote set-head
|
37
|
+
1ab4585881a6a42889f01aa0cfe25fd5acfaf46f 1ab4585881a6a42889f01aa0cfe25fd5acfaf46f Jianfeng Liu <macjianfeng@JFLMBP.local> 1718708982 +0200 remote set-head
|
38
|
+
c120fc812b9ad311c34a3608512d6a9d976bb48e c120fc812b9ad311c34a3608512d6a9d976bb48e Jianfeng Liu <macjianfeng@JFLMBP.cin.medizin.uni-tuebingen.de> 1718893145 +0200 remote set-head
|
@@ -8,3 +8,7 @@ d9c2403fd166ce791b4e9d0c6792ed8342c71fcd 14449a0e6ba4ea2f1a73acf63ef91c9c6193f9e
|
|
8
8
|
3bbd972aa7ad680858f8dfbd0f7fcd97756f0d6f 48a88fc5806305d0bb0755ee6801161b79696972 Jianfeng <Jianfeng.Liu0413@gmail.com> 1718526959 +0200 update by push
|
9
9
|
48a88fc5806305d0bb0755ee6801161b79696972 cf0c0d9c6fb09473aaeb7f7e2edbd770c3f2ef3d Jianfeng <Jianfeng.Liu0413@gmail.com> 1718553464 +0200 update by push
|
10
10
|
cf0c0d9c6fb09473aaeb7f7e2edbd770c3f2ef3d 6427a4edff08f93d98f511418423f09f2ab90bcd Jianfeng <Jianfeng.Liu0413@gmail.com> 1718555183 +0200 update by push
|
11
|
+
6427a4edff08f93d98f511418423f09f2ab90bcd d9dfa5aee51e92a541b707e8e7baea6f06deff98 Jianfeng <Jianfeng.Liu0413@gmail.com> 1718570102 +0200 update by push
|
12
|
+
d9dfa5aee51e92a541b707e8e7baea6f06deff98 e31356f90ea6dd0577b5e0b40b206319adcbf085 Jianfeng <Jianfeng.Liu0413@gmail.com> 1718618415 +0200 update by push
|
13
|
+
e31356f90ea6dd0577b5e0b40b206319adcbf085 1ab4585881a6a42889f01aa0cfe25fd5acfaf46f Jianfeng <Jianfeng.Liu0413@gmail.com> 1718619570 +0200 update by push
|
14
|
+
1ab4585881a6a42889f01aa0cfe25fd5acfaf46f c120fc812b9ad311c34a3608512d6a9d976bb48e Jianfeng <Jianfeng.Liu0413@gmail.com> 1718893144 +0200 update by push
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
py2ls/.git/refs/heads/main
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
c120fc812b9ad311c34a3608512d6a9d976bb48e
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
c120fc812b9ad311c34a3608512d6a9d976bb48e
|
py2ls/__init__.py
CHANGED
py2ls/ips.py
CHANGED
@@ -3,9 +3,8 @@ import numpy as np
|
|
3
3
|
import pandas as pd
|
4
4
|
import json
|
5
5
|
import matplotlib.pyplot as plt
|
6
|
-
# from functools import partial
|
7
6
|
import seaborn as sns
|
8
|
-
import scienceplots
|
7
|
+
# import scienceplots
|
9
8
|
import matplotlib
|
10
9
|
import sys
|
11
10
|
import os
|
@@ -15,7 +14,7 @@ from scipy import stats
|
|
15
14
|
import matplotlib.ticker as tck
|
16
15
|
from cycler import cycler
|
17
16
|
import re
|
18
|
-
from PIL import ImageEnhance, ImageOps,ImageFilter
|
17
|
+
from PIL import Image,ImageEnhance, ImageOps,ImageFilter
|
19
18
|
from rembg import remove,new_session
|
20
19
|
from mpl_toolkits.mplot3d import Axes3D
|
21
20
|
import docx
|
@@ -37,17 +36,39 @@ from box import Box, BoxList
|
|
37
36
|
from numerizer import numerize
|
38
37
|
from tqdm import tqdm
|
39
38
|
import mimetypes
|
39
|
+
from pprint import pp
|
40
|
+
from dateutil import parser
|
41
|
+
from datetime import datetime
|
42
|
+
|
43
|
+
def str2date(date_str, fmt='%Y-%m-%d_%H:%M:%S'):
|
44
|
+
"""
|
45
|
+
Convert a date string into the specified format.
|
46
|
+
Parameters:
|
47
|
+
- date_str (str): The date string to be converted.
|
48
|
+
- fmt (str): The format to convert the date to. Defaults to '%Y%m%d'.
|
49
|
+
Returns:
|
50
|
+
- str: The converted date string.
|
51
|
+
"""
|
52
|
+
try:
|
53
|
+
date_obj = parser.parse(date_str)
|
54
|
+
except ValueError as e:
|
55
|
+
raise ValueError(f"Unable to parse date string: {date_str}. Error: {e}")
|
56
|
+
# Format the date object to the desired output format
|
57
|
+
formatted_date = date_obj.strftime(fmt)
|
58
|
+
return formatted_date
|
59
|
+
# str1=str2date(num2str(20240625),fmt="%a %d-%B-%Y")
|
60
|
+
# print(str1)
|
61
|
+
# str2=str2num(str2date(str1,fmt='%a %Y%m%d'))
|
62
|
+
# print(str2)
|
40
63
|
|
41
64
|
def str2num(s, *args):
|
42
65
|
delimiter = None
|
43
66
|
round_digits = None
|
44
|
-
|
45
67
|
for arg in args:
|
46
68
|
if isinstance(arg, str):
|
47
69
|
delimiter = arg
|
48
70
|
elif isinstance(arg, int):
|
49
71
|
round_digits = arg
|
50
|
-
|
51
72
|
try:
|
52
73
|
num = int(s)
|
53
74
|
except ValueError:
|
@@ -2107,7 +2128,6 @@ def load_img(fpath):
|
|
2107
2128
|
FileNotFoundError: If the specified file is not found.
|
2108
2129
|
OSError: If the specified file cannot be opened or is not a valid image file.
|
2109
2130
|
"""
|
2110
|
-
from PIL import Image
|
2111
2131
|
|
2112
2132
|
try:
|
2113
2133
|
img = Image.open(fpath)
|
@@ -2188,6 +2208,7 @@ def apply_filter(img, *args):
|
|
2188
2208
|
raise ValueError(
|
2189
2209
|
f"Unsupported filter: {filter_name}, should be one of: {supported_filters}"
|
2190
2210
|
)
|
2211
|
+
|
2191
2212
|
for arg in args:
|
2192
2213
|
if isinstance(arg, str):
|
2193
2214
|
filter_name = arg
|
@@ -2286,6 +2307,29 @@ def imgsets(
|
|
2286
2307
|
Note:
|
2287
2308
|
The "color" and "enhance" enhancements are not implemented in this function.
|
2288
2309
|
"""
|
2310
|
+
supported_filters = [
|
2311
|
+
"BLUR",
|
2312
|
+
"CONTOUR",
|
2313
|
+
"DETAIL",
|
2314
|
+
"EDGE_ENHANCE",
|
2315
|
+
"EDGE_ENHANCE_MORE",
|
2316
|
+
"EMBOSS",
|
2317
|
+
"FIND_EDGES",
|
2318
|
+
"SHARPEN",
|
2319
|
+
"SMOOTH",
|
2320
|
+
"SMOOTH_MORE",
|
2321
|
+
"MIN_FILTER",
|
2322
|
+
"MAX_FILTER",
|
2323
|
+
"MODE_FILTER",
|
2324
|
+
"MULTIBAND_FILTER",
|
2325
|
+
"GAUSSIAN_BLUR",
|
2326
|
+
"BOX_BLUR",
|
2327
|
+
"MEDIAN_FILTER",
|
2328
|
+
]
|
2329
|
+
print("sets: a dict,'sharp:1.2','color','contrast:'auto' or 1.2','bright', 'crop: x_upperleft,y_upperleft, x_lowerright, y_lowerright','rotation','resize','rem or background'")
|
2330
|
+
print(f"usage: filter_kws 'dict' below:")
|
2331
|
+
pp([str(i).lower() for i in supported_filters])
|
2332
|
+
print("\nlog:\n")
|
2289
2333
|
def confirm_rembg_models(model_name):
|
2290
2334
|
models_support = [
|
2291
2335
|
"u2net",
|
@@ -2734,6 +2778,46 @@ def figsets(*args):
|
|
2734
2778
|
plt.tight_layout()
|
2735
2779
|
plt.gcf().align_labels()
|
2736
2780
|
|
2781
|
+
def thumbnail(dir_img_list,figsize=(10,10),dpi=100, dir_save=None, kind='.png'):
|
2782
|
+
"""
|
2783
|
+
Display a thumbnail figure of all images in the specified directory.
|
2784
|
+
Args:
|
2785
|
+
dir_img_list (list): List of the Directory containing the images.
|
2786
|
+
"""
|
2787
|
+
num_images = len(dir_img_list)
|
2788
|
+
if not kind.startswith('.'):
|
2789
|
+
kind='.'+kind
|
2790
|
+
|
2791
|
+
if num_images == 0:
|
2792
|
+
print("No images found to display.")
|
2793
|
+
return
|
2794
|
+
grid_size = int(num_images ** 0.5) + 1 # Determine grid size
|
2795
|
+
fig, axs = plt.subplots(grid_size, grid_size, figsize=figsize,dpi=dpi)
|
2796
|
+
for ax, image_file in zip(axs.flatten(), dir_img_list):
|
2797
|
+
try:
|
2798
|
+
img = Image.open(image_file)
|
2799
|
+
ax.imshow(img)
|
2800
|
+
ax.axis('off')
|
2801
|
+
except:
|
2802
|
+
continue
|
2803
|
+
# for ax in axs.flatten():
|
2804
|
+
# ax.axis('off')
|
2805
|
+
[ax.axis("off") for ax in axs.flatten()]
|
2806
|
+
plt.tight_layout()
|
2807
|
+
if dir_save is None:
|
2808
|
+
plt.show()
|
2809
|
+
else:
|
2810
|
+
if basename(dir_save):
|
2811
|
+
fname= basename(dir_save) +kind
|
2812
|
+
else:
|
2813
|
+
fname= "_thumbnail_"+basename(dirname(dir_save)[:-1])+'.png'
|
2814
|
+
if dirname(dir_img_list[0]) == dirname(dir_save):
|
2815
|
+
figsave(dirname(dir_save[:-1]),fname)
|
2816
|
+
else:
|
2817
|
+
figsave(dirname(dir_save),fname)
|
2818
|
+
# usage:
|
2819
|
+
# fpath = "/Users/macjianfeng/Dropbox/github/python/py2ls/tests/xample_netfinder/images/"
|
2820
|
+
# thumbnail(listdir(fpath,'png').fpath.to_list(),dir_save=dirname(fpath))
|
2737
2821
|
def read_mplstyle(style_file):
|
2738
2822
|
# Load the style file
|
2739
2823
|
plt.style.use(style_file)
|
py2ls/netfinder.py
CHANGED
@@ -21,8 +21,11 @@ from selenium.webdriver.support.ui import WebDriverWait
|
|
21
21
|
from selenium.webdriver.support import expected_conditions as EC
|
22
22
|
from webdriver_manager.chrome import ChromeDriverManager
|
23
23
|
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
24
|
-
import
|
24
|
+
from pprint import pp
|
25
25
|
import mimetypes
|
26
|
+
import io
|
27
|
+
import matplotlib.pyplot as plt
|
28
|
+
from PIL import Image
|
26
29
|
|
27
30
|
# Set up logging
|
28
31
|
logging.basicConfig(level=logging.INFO)
|
@@ -313,7 +316,7 @@ def pdf_detector(url, contains=None, dir_save=None,booster=False):
|
|
313
316
|
pdf_links = filter_links(links=links_all, contains=["pdf"])
|
314
317
|
|
315
318
|
if pdf_links:
|
316
|
-
|
319
|
+
pp(f"pdf detected{pdf_links}")
|
317
320
|
else:
|
318
321
|
print('no pdf file')
|
319
322
|
if dir_save:
|
@@ -399,9 +402,13 @@ def find_img(url, dir_save="images", verbose=True):
|
|
399
402
|
print(f"Failed to process image {image_url}: {e}")
|
400
403
|
print(f"images were saved at\n{dir_save}")
|
401
404
|
if verbose:
|
402
|
-
display_thumbnail_figure(flist(dir_save,filter='img'),dpi=
|
405
|
+
display_thumbnail_figure(flist(dir_save,filter='img'),dpi=100)
|
403
406
|
return content
|
404
|
-
|
407
|
+
def svg_to_png(svg_file):
|
408
|
+
with WandImage(filename=svg_file, resolution=300) as img:
|
409
|
+
img.format = 'png'
|
410
|
+
png_image = img.make_blob()
|
411
|
+
return Image.open(io.BytesIO(png_image))
|
405
412
|
def display_thumbnail_figure(dir_img_list,figsize=(10,10),dpi=100):
|
406
413
|
import matplotlib.pyplot as plt
|
407
414
|
from PIL import Image
|
@@ -422,13 +429,14 @@ def display_thumbnail_figure(dir_img_list,figsize=(10,10),dpi=100):
|
|
422
429
|
fig, axs = plt.subplots(grid_size, grid_size, figsize=figsize,dpi=dpi)
|
423
430
|
|
424
431
|
for ax, image_file in zip(axs.flatten(), dir_img_list):
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
432
|
+
try:
|
433
|
+
img = Image.open(image_file)
|
434
|
+
ax.imshow(img)
|
435
|
+
ax.axis('off') # Hide axes
|
436
|
+
except:
|
437
|
+
continue
|
429
438
|
# Hide remaining subplots
|
430
|
-
for ax in axs.flatten()
|
431
|
-
ax.axis('off')
|
439
|
+
[ax.axis("off") for ax in axs.flatten()]
|
432
440
|
|
433
441
|
plt.tight_layout()
|
434
442
|
plt.show()
|
Binary file
|
@@ -1,12 +1,149 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: py2ls
|
3
|
-
Version: 0.1.4.
|
3
|
+
Version: 0.1.4.6
|
4
4
|
Summary: py(thon)2(too)ls
|
5
5
|
Author: Jianfeng
|
6
6
|
Author-email: Jianfeng.Liu0413@gmail.com
|
7
|
-
Requires-Python: >=3.
|
7
|
+
Requires-Python: >=3.10,<4.0
|
8
8
|
Classifier: Programming Language :: Python :: 3
|
9
|
+
Classifier: Programming Language :: Python :: 3.10
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
9
11
|
Classifier: Programming Language :: Python :: 3.12
|
12
|
+
Requires-Dist: Deprecated (>=1.2.14,<2.0.0)
|
13
|
+
Requires-Dist: Jinja2 (>=3.1.4,<4.0.0)
|
14
|
+
Requires-Dist: MarkupSafe (>=2.1.5,<3.0.0)
|
15
|
+
Requires-Dist: PyMatting (>=1.1.12,<2.0.0)
|
16
|
+
Requires-Dist: PyPDF2 (>=3.0.1,<4.0.0)
|
17
|
+
Requires-Dist: PySocks (>=1.7.1,<2.0.0)
|
18
|
+
Requires-Dist: PyYAML (>=6.0.1,<7.0.0)
|
19
|
+
Requires-Dist: Pygments (>=2.18.0,<3.0.0)
|
20
|
+
Requires-Dist: SciencePlots (>=2.1.1,<3.0.0)
|
21
|
+
Requires-Dist: appnope (>=0.1.4,<0.2.0)
|
22
|
+
Requires-Dist: appscript (>=1.2.5,<2.0.0)
|
23
|
+
Requires-Dist: asttokens (>=2.4.1,<3.0.0)
|
24
|
+
Requires-Dist: attrs (>=23.2.0,<24.0.0)
|
25
|
+
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
26
|
+
Requires-Dist: certifi (>=2024.6.2,<2025.0.0)
|
27
|
+
Requires-Dist: chardet (>=3.0.4,<4.0.0)
|
28
|
+
Requires-Dist: charset-normalizer (>=3.3.2,<4.0.0)
|
29
|
+
Requires-Dist: click (>=8.1.7,<9.0.0)
|
30
|
+
Requires-Dist: colorcet (>=3.1.0,<4.0.0)
|
31
|
+
Requires-Dist: coloredlogs (>=15.0.1,<16.0.0)
|
32
|
+
Requires-Dist: comm (>=0.2.2,<0.3.0)
|
33
|
+
Requires-Dist: contourpy (>=1.2.1,<2.0.0)
|
34
|
+
Requires-Dist: cycler (>=0.12.1,<0.13.0)
|
35
|
+
Requires-Dist: debugpy (>=1.8.1,<2.0.0)
|
36
|
+
Requires-Dist: decorator (>=5.1.1,<6.0.0)
|
37
|
+
Requires-Dist: defusedxml (>=0.7.1,<0.8.0)
|
38
|
+
Requires-Dist: docx (>=0.2.4,<0.3.0)
|
39
|
+
Requires-Dist: docx2pdf (>=0.1.8,<0.2.0)
|
40
|
+
Requires-Dist: executing (>=2.0.1,<3.0.0)
|
41
|
+
Requires-Dist: fake-useragent (>=1.5.1,<2.0.0)
|
42
|
+
Requires-Dist: flatbuffers (>=24.3.25,<25.0.0)
|
43
|
+
Requires-Dist: fonttools (>=4.53.0,<5.0.0)
|
44
|
+
Requires-Dist: fpdf (>=1.7.2,<2.0.0)
|
45
|
+
Requires-Dist: googletrans (>=4.0.0rc1,<5.0.0)
|
46
|
+
Requires-Dist: h11 (>=0.9.0,<0.10.0)
|
47
|
+
Requires-Dist: h2 (>=3.2.0,<4.0.0)
|
48
|
+
Requires-Dist: hpack (>=3.0.0,<4.0.0)
|
49
|
+
Requires-Dist: hstspreload (>=2024.6.1,<2025.0.0)
|
50
|
+
Requires-Dist: httpcore (>=0.9.1,<0.10.0)
|
51
|
+
Requires-Dist: httpx (>=0.13.3,<0.14.0)
|
52
|
+
Requires-Dist: humanfriendly (>=10.0,<11.0)
|
53
|
+
Requires-Dist: hyperframe (>=5.2.0,<6.0.0)
|
54
|
+
Requires-Dist: idna (>=2.10,<3.0)
|
55
|
+
Requires-Dist: imageio (>=2.34.1,<3.0.0)
|
56
|
+
Requires-Dist: img2pdf (>=0.5.1,<0.6.0)
|
57
|
+
Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
|
58
|
+
Requires-Dist: ipython (>=8.25.0,<9.0.0) ; python_version >= "3.9" and python_version < "4.0"
|
59
|
+
Requires-Dist: jedi (>=0.19.1,<0.20.0)
|
60
|
+
Requires-Dist: joblib (>=1.3.2,<2.0.0)
|
61
|
+
Requires-Dist: jsonschema (>=4.22.0,<5.0.0)
|
62
|
+
Requires-Dist: jsonschema-specifications (>=2023.12.1,<2024.0.0)
|
63
|
+
Requires-Dist: jupyter_client (>=8.6.2,<9.0.0)
|
64
|
+
Requires-Dist: jupyter_core (>=5.7.2,<6.0.0)
|
65
|
+
Requires-Dist: kiwisolver (>=1.4.5,<2.0.0)
|
66
|
+
Requires-Dist: langdetect (>=1.0.9,<2.0.0)
|
67
|
+
Requires-Dist: lazy_loader (>=0.4,<0.5)
|
68
|
+
Requires-Dist: libretranslatepy (>=2.1.1,<3.0.0)
|
69
|
+
Requires-Dist: llvmlite (>=0.42.0,<0.43.0)
|
70
|
+
Requires-Dist: lxml (>=4.9.4,<5.0.0)
|
71
|
+
Requires-Dist: matplotlib (>=3.8.4,<4.0.0)
|
72
|
+
Requires-Dist: matplotlib-inline (>=0.1.7,<0.2.0)
|
73
|
+
Requires-Dist: mne (>=1.6.0,<2.0.0)
|
74
|
+
Requires-Dist: mpmath (>=1.3.0,<2.0.0)
|
75
|
+
Requires-Dist: nest-asyncio (>=1.6.0,<2.0.0)
|
76
|
+
Requires-Dist: networkx (>=3.3,<4.0) ; python_version >= "3.10" and python_version < "4.0"
|
77
|
+
Requires-Dist: nltk (>=3.8.1,<4.0.0)
|
78
|
+
Requires-Dist: numba (>=0.59.1,<0.60.0)
|
79
|
+
Requires-Dist: numerizer (>=0.2.3,<0.3.0)
|
80
|
+
Requires-Dist: numpy (>=1.26.4,<2.0.0)
|
81
|
+
Requires-Dist: onnxruntime (>=1.18.0,<2.0.0)
|
82
|
+
Requires-Dist: opencv-contrib-python (>=4.9.0.80,<5.0.0.0)
|
83
|
+
Requires-Dist: opencv-python (>=4.9.0.80,<5.0.0.0)
|
84
|
+
Requires-Dist: opencv-python-headless (>=4.9.0.80,<5.0.0.0)
|
85
|
+
Requires-Dist: outcome (>=1.3.0.post0,<2.0.0)
|
86
|
+
Requires-Dist: packaging (>=24.1,<25.0)
|
87
|
+
Requires-Dist: pandas (>=2.2.2,<3.0.0)
|
88
|
+
Requires-Dist: pandas-flavor (>=0.6.0,<0.7.0)
|
89
|
+
Requires-Dist: parso (>=0.8.4,<0.9.0)
|
90
|
+
Requires-Dist: patsy (>=0.5.6,<0.6.0)
|
91
|
+
Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
|
92
|
+
Requires-Dist: pdf2img (>=0.1.2,<0.2.0)
|
93
|
+
Requires-Dist: pexpect (>=4.9.0,<5.0.0)
|
94
|
+
Requires-Dist: pikepdf (>=9.0.0,<10.0.0)
|
95
|
+
Requires-Dist: pillow (>=10.3.0,<11.0.0)
|
96
|
+
Requires-Dist: pingouin (>=0.5.4,<0.6.0)
|
97
|
+
Requires-Dist: platformdirs (>=4.2.2,<5.0.0)
|
98
|
+
Requires-Dist: pooch (>=1.8.2,<2.0.0)
|
99
|
+
Requires-Dist: prompt_toolkit (>=3.0.47,<4.0.0)
|
100
|
+
Requires-Dist: protobuf (>=5.27.1,<6.0.0)
|
101
|
+
Requires-Dist: psutil (>=5.9.8,<6.0.0)
|
102
|
+
Requires-Dist: ptyprocess (>=0.7.0,<0.8.0)
|
103
|
+
Requires-Dist: pure-eval (>=0.2.2,<0.3.0)
|
104
|
+
Requires-Dist: pyparsing (>=3.1.2,<4.0.0)
|
105
|
+
Requires-Dist: python-box (>=7.2.0,<8.0.0)
|
106
|
+
Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
|
107
|
+
Requires-Dist: python-docx (>=1.1.0,<2.0.0)
|
108
|
+
Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
|
109
|
+
Requires-Dist: pytz (>=2024.1,<2025.0)
|
110
|
+
Requires-Dist: pyzmq (>=26.0.3,<27.0.0)
|
111
|
+
Requires-Dist: referencing (>=0.35.1,<0.36.0)
|
112
|
+
Requires-Dist: regex (>=2024.5.15,<2025.0.0)
|
113
|
+
Requires-Dist: rembg (>=2.0.56,<3.0.0) ; python_version >= "3.9" and python_version < "3.13"
|
114
|
+
Requires-Dist: requests (>=2.32.3,<3.0.0)
|
115
|
+
Requires-Dist: rfc3986 (>=1.5.0,<2.0.0)
|
116
|
+
Requires-Dist: rpds-py (>=0.18.1,<0.19.0)
|
117
|
+
Requires-Dist: scikit-image (>=0.23.2,<0.24.0) ; python_version >= "3.10" and python_version < "4.0"
|
118
|
+
Requires-Dist: scikit-learn (>=1.3.2,<2.0.0)
|
119
|
+
Requires-Dist: scipy (>=1.13.1,<2.0.0)
|
120
|
+
Requires-Dist: seaborn (>=0.13.2,<0.14.0)
|
121
|
+
Requires-Dist: selenium (>=4.21.0,<5.0.0)
|
122
|
+
Requires-Dist: six (>=1.16.0,<2.0.0)
|
123
|
+
Requires-Dist: sniffio (>=1.3.1,<2.0.0)
|
124
|
+
Requires-Dist: sortedcontainers (>=2.4.0,<3.0.0)
|
125
|
+
Requires-Dist: soupsieve (>=2.5,<3.0)
|
126
|
+
Requires-Dist: stack-data (>=0.6.3,<0.7.0)
|
127
|
+
Requires-Dist: statsmodels (>=0.14.1,<0.15.0)
|
128
|
+
Requires-Dist: stem (>=1.8.2,<2.0.0)
|
129
|
+
Requires-Dist: sympy (>=1.12.1,<2.0.0)
|
130
|
+
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
131
|
+
Requires-Dist: threadpoolctl (>=3.5.0,<4.0.0)
|
132
|
+
Requires-Dist: tifffile (>=2024.5.22,<2025.0.0)
|
133
|
+
Requires-Dist: tornado (>=6.4.1,<7.0.0)
|
134
|
+
Requires-Dist: tqdm (>=4.66.4,<5.0.0)
|
135
|
+
Requires-Dist: traitlets (>=5.14.3,<6.0.0)
|
136
|
+
Requires-Dist: translate (>=3.6.1,<4.0.0)
|
137
|
+
Requires-Dist: trio (>=0.25.1,<0.26.0)
|
138
|
+
Requires-Dist: trio-websocket (>=0.11.1,<0.12.0)
|
139
|
+
Requires-Dist: typing_extensions (>=4.12.2,<5.0.0)
|
140
|
+
Requires-Dist: tzdata (>=2024.1,<2025.0)
|
141
|
+
Requires-Dist: urllib3 (>=2.2.1,<3.0.0)
|
142
|
+
Requires-Dist: wcwidth (>=0.2.13,<0.3.0)
|
143
|
+
Requires-Dist: webdriver-manager (>=4.0.1,<5.0.0)
|
144
|
+
Requires-Dist: wrapt (>=1.16.0,<2.0.0)
|
145
|
+
Requires-Dist: wsproto (>=1.2.0,<2.0.0)
|
146
|
+
Requires-Dist: xarray (>=2024.6.0,<2025.0.0)
|
10
147
|
Description-Content-Type: text/markdown
|
11
148
|
|
12
149
|
# Install
|
@@ -1,5 +1,6 @@
|
|
1
|
-
py2ls/.
|
2
|
-
py2ls/.git/
|
1
|
+
py2ls/.DS_Store,sha256=1lFlJ5EFymdzGAUAaI30vcaaLHt3F1LwpG7xILf9jsM,6148
|
2
|
+
py2ls/.git/COMMIT_EDITMSG,sha256=5xj-jWMbrdOc9m7gSn-UcsAQ9FMNvWSbLWSsrOUIO5w,7
|
3
|
+
py2ls/.git/FETCH_HEAD,sha256=1FfG9FtKEzbthC4ygl5ci0pnEm7ZaF3ZY7njNqkjz2I,100
|
3
4
|
py2ls/.git/HEAD,sha256=KNJb-Cr0wOK3L1CVmyvrhZ4-YLljCl6MYD2tTdsrboA,21
|
4
5
|
py2ls/.git/config,sha256=XswTg1Ts7_7IBDlKHh4OF_0Tq7v4wW7BXb6xSVInSec,345
|
5
6
|
py2ls/.git/description,sha256=ZzMxc0Ca26m45Twn1DDnOHqin5VHEZ9uOTBrScIXSjE,16
|
@@ -16,15 +17,16 @@ py2ls/.git/hooks/pre-receive.sample,sha256=pMPSuce7P9jRRBwxvU7nGlldZrRPz0ndsxAlI
|
|
16
17
|
py2ls/.git/hooks/prepare-commit-msg.sample,sha256=6d3KpBif3dJe2X_Ix4nsp7bKFjkLI5KuMnbwyOGqRhk,1492
|
17
18
|
py2ls/.git/hooks/push-to-checkout.sample,sha256=pT0HQXmLKHxt16-mSu5HPzBeZdP0lGO7nXQI7DsSv18,2783
|
18
19
|
py2ls/.git/hooks/update.sample,sha256=jV8vqD4QPPCLV-qmdSHfkZT0XL28s32lKtWGCXoU0QY,3650
|
19
|
-
py2ls/.git/index,sha256=
|
20
|
+
py2ls/.git/index,sha256=XcsDiczPb7unmufhwqWxcgRV2ye2gknaLxmx1rsiyY4,1346
|
20
21
|
py2ls/.git/info/exclude,sha256=ZnH-g7egfIky7okWTR8nk7IxgFjri5jcXAbuClo7DsE,240
|
21
|
-
py2ls/.git/logs/HEAD,sha256=
|
22
|
-
py2ls/.git/logs/refs/heads/main,sha256=
|
23
|
-
py2ls/.git/logs/refs/remotes/origin/HEAD,sha256=
|
24
|
-
py2ls/.git/logs/refs/remotes/origin/main,sha256=
|
22
|
+
py2ls/.git/logs/HEAD,sha256=ZbfxG7S0VtQedb4Z-bCwPRW4PV1fONzgYIQ0VaZfxvE,2435
|
23
|
+
py2ls/.git/logs/refs/heads/main,sha256=ZbfxG7S0VtQedb4Z-bCwPRW4PV1fONzgYIQ0VaZfxvE,2435
|
24
|
+
py2ls/.git/logs/refs/remotes/origin/HEAD,sha256=kaJ5UZPeGXVtmL3wpB90jgENFdS20YNyknYovGBS-Gg,6109
|
25
|
+
py2ls/.git/logs/refs/remotes/origin/main,sha256=tgL6shcaeZwXHsJo_YIMa_jmUz0_7nRnJ-Z4lLH_bdk,2128
|
25
26
|
py2ls/.git/objects/0b/409e1bc918277010f5679b402d1d1dda53e15c,sha256=y5S1XaGxJz1NXi-SPWjPC_NKIqqSbZv9oOg74MzBihY,156
|
26
27
|
py2ls/.git/objects/14/449a0e6ba4ea2f1a73acf63ef91c9c6193f9ed,sha256=PomZFmCUCQM1ii0wH-OJGSHLQCTqRtIwE5w3C0TtzSY,171
|
27
28
|
py2ls/.git/objects/15/a8e468aacfcb440e090020f36d0b985d45da23,sha256=xiRunMcN5I_B2hHgBUFupR-F0b8H_CQTmmAZG9XkZik,3215
|
29
|
+
py2ls/.git/objects/1a/b4585881a6a42889f01aa0cfe25fd5acfaf46f,sha256=iQsKMPNKUs4WQwhiLgXmG5V3xKyIgxmc13ZwbBATvhQ,165
|
28
30
|
py2ls/.git/objects/1d/fe9d9633b24ea560354f4f93d39c6e5f163ea0,sha256=mV_84wLqIitnSYmzfrNpTzwVP9AmksiRI0Fjltwl0Pg,8872
|
29
31
|
py2ls/.git/objects/24/6b368b986f758630c46dc02b7fa512b53422f7,sha256=sw7ERFCFu7m6fnURAqQfQ4GWShaARr-Vc6GRnlOPkxU,8512
|
30
32
|
py2ls/.git/objects/25/b796accd261b9135fd32a2c00785f68edf6c46,sha256=4ic5vOwEdfbGL8oARSVEeAnSoDs14-gggGZEL-61nYE,564
|
@@ -37,16 +39,27 @@ py2ls/.git/objects/3c/bbe5f4173d165127b9ad96119f1ec24c306ffc,sha256=S1BXemROYtzR
|
|
37
39
|
py2ls/.git/objects/3f/d6561300938afbb3d11976cf9c8f29549280d9,sha256=91oqbTWfUE1d_hT_1ptYmRUb5pOQ1X4oxQxpF6NXjKU,8501
|
38
40
|
py2ls/.git/objects/43/dbd49b2ee367c5434dd545e3b5795434f2ef0b,sha256=DAzt0dWp2KsuuImCKp7N9ia7KaCDNqwB-tYIx3Wf_c0,565
|
39
41
|
py2ls/.git/objects/48/a88fc5806305d0bb0755ee6801161b79696972,sha256=f3JStE39k_hPGE-WRwqZtDTjQkfOmBVb_6-ELBbScjI,203
|
42
|
+
py2ls/.git/objects/50/08ddfcf53c02e82d7eee2e57c38e5672ef89f6,sha256=p0M2WLqiTe6X2FI_k5Aj0IEsE85jqLa58sVdmV8x1vU,255
|
43
|
+
py2ls/.git/objects/53/e0deb1cb4c2c606bced6e7f9a66b0fda60980d,sha256=muq6m7_XRSFPzypW-m9mhpKfsomCr4s7GfkgM3gh2pc,482344
|
44
|
+
py2ls/.git/objects/56/e4e8b2d5545e0256090f45aa8fc42c5fe067d0,sha256=VsjKo1biAzCV-iIfwCDTPzyfP63K43hdZqJpDP70Iik,529
|
40
45
|
py2ls/.git/objects/58/20a729045d4dc7e37ccaf8aa8eec126850afe2,sha256=3Pf6NS8OTK4EdHZGVeJ421BtK7w4WJncQDBauZI_wW4,34
|
41
46
|
py2ls/.git/objects/60/f273eb1c412d916fa3f11318a7da7a9911b52a,sha256=aJD9iF_LmYSrqDepXFBZKN1yMYbQczVkN_wnrDosBdI,5620
|
42
47
|
py2ls/.git/objects/61/570cec8c061abe74121f27f5face6c69b98f99,sha256=IQZi5MkbRu3ToRUPsRcXuh1Xa3pkAz_HDRCVhNL89ds,5753
|
48
|
+
py2ls/.git/objects/62/7c81b23b4e56e87b042b650b0103653cc9e34a,sha256=pv9wgBxnvJUFSrk9G7vApA6lnSykQSMJ4yXT7YnlSDU,167
|
49
|
+
py2ls/.git/objects/62/d90ccf8cbefdc2e4fd475e7c6f4f76e9fdf801,sha256=1L473QanNpnumCkE8tG6wtbvLqFtNeoagL9SJmasXNY,155
|
43
50
|
py2ls/.git/objects/64/27a4edff08f93d98f511418423f09f2ab90bcd,sha256=RyNngwk9fvdvvvywmNfllnim718fWNjVauH9U2y8Q2s,258
|
44
51
|
py2ls/.git/objects/69/13c452ca319f7cbf6a0836dc10a5bb033c84e4,sha256=NYLQQZTfd0htZst42ALS2dmryv1q_l1N19ZfHEbz_38,3193
|
52
|
+
py2ls/.git/objects/6a/52e747a2b349b128d1490d9e896d2323818eb7,sha256=Qc_B3_xxlWmjooFu274r82b583uf_HpIpDBldr9fqVI,34966
|
45
53
|
py2ls/.git/objects/6b/7fde264d93a7a0986d394c46c7650d0ce2ab92,sha256=iIl0-RF0wd6BSEjzczgUyApxc899PbdTl04JbDn6_-Q,166
|
54
|
+
py2ls/.git/objects/6c/cebb29b7f3f5b0c889f6dadbf9ff066554587d,sha256=UylkFWAfhStNVQRQuC9CzpaWaT9uHCVs1mn7ecOma8I,609
|
55
|
+
py2ls/.git/objects/71/36b2074a2754be8b58127d82250e5b37e3c373,sha256=cbVFQaBx0Q5QkZ1wQle-iIxNx14JxGSx3G8aQ7EbbAA,586
|
46
56
|
py2ls/.git/objects/78/3d4167bc95c9d2175e0df03ef1c1c880ba75ab,sha256=SK2QDjDBiDhVMG1I5p19g4RbEm2Rax7mYnxawmVZYxs,15523
|
47
57
|
py2ls/.git/objects/79/7ae089b2212a937840e215276005ce76881307,sha256=lQOKF2pb1JvipI3eT79X0-TuMGWsy1A-Yw4BCgKZNOM,33472
|
48
58
|
py2ls/.git/objects/7e/5956c806b5edc344d46dab599dec337891ba1f,sha256=sfqJBiSNj-gyJo4D7xkmRAo76mC2ztjqeZZsl4ifULA,162
|
59
|
+
py2ls/.git/objects/81/8f26b7bf042269729020cf944fc362d66ba27e,sha256=mg6FGEyv6EcOgurR8CEvHGovaWrUgMUxTtACAy7-ei4,34960
|
60
|
+
py2ls/.git/objects/84/59071b722a255b774a80b27746033f8141ab39,sha256=0pYGJOXFfp4MSu4n5MzE1XN--t0lSs7wcdqboADWMx0,9792
|
49
61
|
py2ls/.git/objects/87/ef1fc3f7f1ddc4d0ab9b3e65381ce9f3388621,sha256=OFrpW6lu31qGBvD3ijPUBSG9JrdU1_mKzeYBzidn9VM,3748
|
62
|
+
py2ls/.git/objects/8b/84f56978e1de8f2ae82abce5f8b3e182d365cd,sha256=a8XequnUMBSv9zIQJdcdgDvMQ7PLGdIrgZ-MqQGF87c,573
|
50
63
|
py2ls/.git/objects/8e/55a7d2b96184030211f20c9b9af201eefcac82,sha256=yW-jVYeCTWR-nX3JJgA1g9YLPjzNsKlDmEOH290Ywx0,1221
|
51
64
|
py2ls/.git/objects/91/c69ad88fe0ba94aa7859fb5f7edac5e6f1a3f7,sha256=Kk2MWCO1OcShYuABGzp2O9LiWGDfDkcZtd0oy4nY6RU,9529
|
52
65
|
py2ls/.git/objects/9d/0df52899fe95279059286d9c0ec42287edc168,sha256=67nV3TLo-fwe4lt0wwvxoDnVNHc1IpapRyAY2STP3iI,564
|
@@ -55,38 +68,44 @@ py2ls/.git/objects/a7/3e13eafee65c5b8d73ad2d3ea46d0eee82f0d3,sha256=iv3uTzna5XBz
|
|
55
68
|
py2ls/.git/objects/b0/56be4be89ba6b76949dd641df45bb7036050c8,sha256=8Y7z30eNceBd5QIx09QfMp5cYBbrgUllmats0kvJEJ4,132
|
56
69
|
py2ls/.git/objects/b0/9cd7856d58590578ee1a4f3ad45d1310a97f87,sha256=82dx4hIdMpdcB64e5PU1s2gZFVkTvrj1cPwwJ_kasNU,4444
|
57
70
|
py2ls/.git/objects/b2/18e6a0f0f1c4df8cdefa9852058348abc713b7,sha256=hOQfdyzDZctjoge0-pAcEDel5XHVPNfOtrMNyFPUOIE,564
|
71
|
+
py2ls/.git/objects/b5/61831c7dce8ea51e7ee6b6fa35745f14d8242d,sha256=wUqxlKjLN1vOUj2tkYStado64QewdcF3CHlSICds1ik,34415
|
58
72
|
py2ls/.git/objects/bb/934eb33bc1a8b85630bf680caffd99560c1b8f,sha256=ggehjexUsWlskHJvHxW7u6U0otB0OCItmIZdT9O-3OU,9670
|
73
|
+
py2ls/.git/objects/c1/20fc812b9ad311c34a3608512d6a9d976bb48e,sha256=q-WAKugB-_-g7w0Mlw6oyTBaXQ_Qd7BdLatrDiYN7Wc,156
|
59
74
|
py2ls/.git/objects/c4/cba65f1163661999ee4b8ed23342b63bc1300c,sha256=rwSdKt-C98nUQ_B-7imY4fYRYmn29MQc4SIu9wruHeo,566
|
60
75
|
py2ls/.git/objects/c6/7f17e5707313600efcb85e9a3fedea35dba591,sha256=TL7rDIWiaWlk8iIwqPst7St5Xr2otPs-vp17GPlET7o,565
|
61
76
|
py2ls/.git/objects/cf/0c0d9c6fb09473aaeb7f7e2edbd770c3f2ef3d,sha256=T_nV0GrgpVu3mOJ4fYcCW98oCunzgqy0DnSX0luy04Q,183
|
77
|
+
py2ls/.git/objects/d6/9ab1c4aadf279936dd778e8346ba60f74705b6,sha256=WcfdSMKqfiWT5TOWVUcDj0XDaD2hYxDnyIRNlYGutL8,34976
|
62
78
|
py2ls/.git/objects/d9/005f2cc7fc4e65f14ed5518276007c08cf2fd0,sha256=IJIoz93V7pf9yx43U1JdN8gBq_LWtw8A9Z2YMPnq_B0,1450
|
63
79
|
py2ls/.git/objects/d9/c2403fd166ce791b4e9d0c6792ed8342c71fcd,sha256=uD7BsKdrmN-9FStTpwsRWh-XxVXeDsV4dGjFkaMIIs8,170
|
80
|
+
py2ls/.git/objects/d9/dfa5aee51e92a541b707e8e7baea6f06deff98,sha256=jMdhZ1i_L5q_UgjOtjLN15PCSCz3pE51FhD3z74ZUr8,163
|
81
|
+
py2ls/.git/objects/db/141dbaa93594df2a8156182f361ee4db829359,sha256=TpKTLvbDc4Blzrp1Pq9JijqDROJyBJ7sCQQBmIuYKZo,845984
|
64
82
|
py2ls/.git/objects/db/ffa8ea7bda721d0cee7b9e4ce5b2ef927733ff,sha256=GhDkvP6JYV26qVg5ETPys1ZEnGlsct9hiXCc24Ky4Xg,565
|
65
83
|
py2ls/.git/objects/df/e0770424b2a19faf507a501ebfc23be8f54e7b,sha256=vCdlxwEidekh8i-5TVMVgSLGk9DPZCZAbWqvGYSKQ9c,76
|
84
|
+
py2ls/.git/objects/e3/1356f90ea6dd0577b5e0b40b206319adcbf085,sha256=I9_QNwmmtoqSwq29Ixdfv_PgF2x14u2M6sX1eQumwoY,161
|
66
85
|
py2ls/.git/objects/e3/5a4dafc50850cacac7bf76c56db2715cbda2c4,sha256=GAcBj3YSEbm6tm7fGD6al16uBo8LtEtjZ2Hi-UgIsUg,3290
|
67
86
|
py2ls/.git/objects/e9/391ffe371f1cc43b42ef09b705d9c767c2e14f,sha256=RWTy2n8L2XxZQknBFyPczA0Aa_4gSG_Ybcr8e8v4ccc,10264
|
68
87
|
py2ls/.git/objects/f4/b64d3107b39e3ad6f540c6607004ea34e6c024,sha256=0egAtqc0x8hc7U1z91tIjcRhSd_BT2a_gxZxo_7NTJA,564
|
69
88
|
py2ls/.git/objects/f7/c98ba5c2f903e603b1f5e63d49fbc8a43815cc,sha256=tYbi3A7irrIPB_11bwItuof0Vc9a0MDuLFMNAzRsG3A,33467
|
89
|
+
py2ls/.git/objects/fa/147e6bb78a2e8db241d231295fd7f1ed061af8,sha256=G9pg5LXv7AdxnPIQsTm2AF3Un314dLRJQYwxmZem9rQ,574
|
70
90
|
py2ls/.git/objects/fc/292e793ecfd42240ac43be407023bd731fa9e7,sha256=hGIYoxKWNT3IPwk3DE4l3FLBbUYF-kXcHcx7KrH9uS0,1971
|
71
|
-
py2ls/.git/refs/heads/main,sha256=
|
91
|
+
py2ls/.git/refs/heads/main,sha256=CKZwTZ8cZZy9HnCOINHmltX6O90E8kPZFdJQ9peSpMk,41
|
72
92
|
py2ls/.git/refs/remotes/origin/HEAD,sha256=K7aiSqD8bEhBAPXVGim7rYQc0sdV9dk_qiBOXbtOsrQ,30
|
73
|
-
py2ls/.git/refs/remotes/origin/main,sha256=
|
93
|
+
py2ls/.git/refs/remotes/origin/main,sha256=CKZwTZ8cZZy9HnCOINHmltX6O90E8kPZFdJQ9peSpMk,41
|
74
94
|
py2ls/.gitattributes,sha256=Gh2-F2vCM7SZ01pX23UT8pQcmauXWfF3gwyRSb6ZAFs,66
|
75
95
|
py2ls/.gitignore,sha256=y7GvbD_zZkjPVVIue8AyiuFkDMuUbvMaV65Lgu89To8,2763
|
76
96
|
py2ls/LICENSE,sha256=UOZ1F5fFDe3XXvG4oNnkL1-Ecun7zpHzRxjp-XsMeAo,11324
|
77
97
|
py2ls/README.md,sha256=CwvJWAnSXnCnrVHlnEbrxxi6MbjbE_MT6DH2D53S818,11572
|
78
|
-
py2ls/__init__.py,sha256=
|
98
|
+
py2ls/__init__.py,sha256=ESXjQ9tnqg5mqYH4Gfgs76AoT1HHF_BkJUgnstiVwR8,243
|
79
99
|
py2ls/brain_atlas.py,sha256=w1o5EelRjq89zuFJUNSz4Da8HnTCwAwDAZ4NU4a-bAY,5486
|
80
100
|
py2ls/correlators.py,sha256=RbOaJIPLCHJtUm5SFi_4dCJ7VFUPWR0PErfK3K26ad4,18243
|
81
101
|
py2ls/dbhandler.py,sha256=i9dNrpHyx0oIaFieHI4X4tsrCdN-aFxudPTDOgy9Ppo,3574
|
82
102
|
py2ls/freqanalysis.py,sha256=F4218VSPbgL5tnngh6xNCYuNnfR-F_QjECUUxrPYZss,32594
|
83
|
-
py2ls/
|
84
|
-
py2ls/
|
85
|
-
py2ls/
|
103
|
+
py2ls/ips.py,sha256=wcA7UITz2Nx5bmDkQvGyZ9mNCvt9ZE9JTRpgCvExNPs,124868
|
104
|
+
py2ls/netfinder.py,sha256=dt6hkYeH-ivCHInoUi92MhJMLlXtjRXT3ewKzOwGtWk,31506
|
105
|
+
py2ls/setuptools-70.1.0-py3-none-any.whl,sha256=2bi3cUVal8ip86s0SOvgspteEF8SKLukECi-EWmFomc,882588
|
86
106
|
py2ls/sleep_events_detectors.py,sha256=36MCuRrpurn0Uvzpo3p3b3_JlVsRNHSWCXbJxCGM3mg,51546
|
87
107
|
py2ls/translator.py,sha256=QfDUO0-pXHGMBFZBefiBHzOrC93-__N5sUQY_VP4wes,29734
|
88
|
-
py2ls/version.py,sha256=CactNZqrHHYTPrkHKccy2WKXmaiUdtTgPqSjFyVXnJk,18
|
89
108
|
py2ls/wb_detector.py,sha256=7y6TmBUj9exCZeIgBAJ_9hwuhkDh1x_-yg4dvNY1_GQ,6284
|
90
|
-
py2ls-0.1.4.
|
91
|
-
py2ls-0.1.4.
|
92
|
-
py2ls-0.1.4.
|
109
|
+
py2ls-0.1.4.6.dist-info/METADATA,sha256=M9tLANmcFhRhKeppFawPAZ4tOTn7lrFw99JBp0Mso2A,17943
|
110
|
+
py2ls-0.1.4.6.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
111
|
+
py2ls-0.1.4.6.dist-info/RECORD,,
|
py2ls/internet_finder.py
DELETED
@@ -1,405 +0,0 @@
|
|
1
|
-
from bs4 import BeautifulSoup
|
2
|
-
import requests
|
3
|
-
import os
|
4
|
-
from urllib.parse import urlparse, urljoin
|
5
|
-
import base64
|
6
|
-
import pandas as pd
|
7
|
-
from collections import Counter
|
8
|
-
import random
|
9
|
-
import logging
|
10
|
-
from time import sleep
|
11
|
-
import stem.process
|
12
|
-
from stem import Signal
|
13
|
-
from stem.control import Controller
|
14
|
-
import json
|
15
|
-
# Set up logging
|
16
|
-
logging.basicConfig(level=logging.INFO)
|
17
|
-
logger = logging.getLogger(__name__)
|
18
|
-
|
19
|
-
# Define supported content types and corresponding parsers
|
20
|
-
CONTENT_PARSERS = {
|
21
|
-
"text/html": lambda text, parser: BeautifulSoup(text, parser),
|
22
|
-
"application/json": lambda text, parser: json.loads(text),
|
23
|
-
"text/xml": lambda text, parser: BeautifulSoup(text, parser),
|
24
|
-
"text/plain": lambda text, parser: text.text,
|
25
|
-
}
|
26
|
-
|
27
|
-
def fetch_all(url, parser="lxml"): # lxml is faster, # parser="html.parser"
|
28
|
-
try:
|
29
|
-
# Generate a random user-agent string
|
30
|
-
headers = {"User-Agent": user_agent()}
|
31
|
-
|
32
|
-
# Send the initial request
|
33
|
-
response = requests.get(url, headers=headers)
|
34
|
-
|
35
|
-
# If the response is a redirect, follow it
|
36
|
-
while response.is_redirect:
|
37
|
-
logger.info(f"Redirecting to: {response.headers['Location']}")
|
38
|
-
response = requests.get(response.headers["Location"], headers=headers)
|
39
|
-
# Check for a 403 error
|
40
|
-
if response.status_code == 403:
|
41
|
-
logger.warning("403 Forbidden error. Retrying...")
|
42
|
-
# Retry the request after a short delay
|
43
|
-
sleep(random.uniform(1, 3))
|
44
|
-
response = requests.get(url, headers=headers)
|
45
|
-
# Raise an error if retry also fails
|
46
|
-
response.raise_for_status()
|
47
|
-
|
48
|
-
# Raise an error for other HTTP status codes
|
49
|
-
response.raise_for_status()
|
50
|
-
|
51
|
-
# Get the content type
|
52
|
-
content_type = response.headers.get("content-type", "").split(";")[0].lower()
|
53
|
-
content = response.content.decode(response.encoding)
|
54
|
-
# logger.info(f"Content type: {content_type}")
|
55
|
-
|
56
|
-
# Check if content type is supported
|
57
|
-
if content_type in CONTENT_PARSERS:
|
58
|
-
return content_type, CONTENT_PARSERS[content_type](content, parser)
|
59
|
-
else:
|
60
|
-
logger.warning("Unsupported content type")
|
61
|
-
return None, None
|
62
|
-
except requests.RequestException as e:
|
63
|
-
logger.error(f"Error fetching URL '{url}': {e}")
|
64
|
-
return None, None
|
65
|
-
def user_agent():
|
66
|
-
# Example of generating a random user-agent string
|
67
|
-
user_agents = [
|
68
|
-
# Windows (Intel)
|
69
|
-
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4891.0 Safari/537.36",
|
70
|
-
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4893.0 Safari/537.36",
|
71
|
-
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4895.0 Safari/537.36",
|
72
|
-
# Windows (ARM)
|
73
|
-
"Mozilla/5.0 (Windows NT 10.0; Win64; arm64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4891.0 Safari/537.36",
|
74
|
-
"Mozilla/5.0 (Windows NT 10.0; Win64; arm64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4893.0 Safari/537.36",
|
75
|
-
"Mozilla/5.0 (Windows NT 10.0; Win64; arm64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4895.0 Safari/537.36",
|
76
|
-
# Linux (x86_64)
|
77
|
-
"Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0",
|
78
|
-
"Mozilla/5.0 (X11; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0",
|
79
|
-
"Mozilla/5.0 (X11; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0",
|
80
|
-
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
|
81
|
-
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4891.0 Safari/537.36",
|
82
|
-
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4893.0 Safari/537.36",
|
83
|
-
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4895.0 Safari/537.36",
|
84
|
-
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0",
|
85
|
-
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0",
|
86
|
-
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0",
|
87
|
-
# macOS (Intel)
|
88
|
-
"Mozilla/5.0 (Macintosh; Intel Mac OS X 12_0_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15",
|
89
|
-
"Mozilla/5.0 (Macintosh; Intel Mac OS X 12_0_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15",
|
90
|
-
# macOS (ARM)
|
91
|
-
"Mozilla/5.0 (Macintosh; ARM Mac OS X 12_0_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15",
|
92
|
-
"Mozilla/5.0 (Macintosh; ARM Mac OS X 12_0_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15",
|
93
|
-
# iOS Devices
|
94
|
-
"Mozilla/5.0 (iPad; CPU OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",
|
95
|
-
"Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",
|
96
|
-
# Android Devices
|
97
|
-
"Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4891.0 Mobile Safari/537.36",
|
98
|
-
"Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4893.0 Mobile Safari/537.36",
|
99
|
-
"Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4895.0 Mobile Safari/537.36",
|
100
|
-
# Smart TVs
|
101
|
-
"Mozilla/5.0 (SMART-TV; LINUX; Tizen 6.0) AppleWebKit/537.36 (KHTML, like Gecko) SmartTV/1.0",
|
102
|
-
"Mozilla/5.0 (SMART-TV; LINUX; Tizen 6.0) AppleWebKit/537.36 (KHTML, like Gecko) WebAppManager/1.0",
|
103
|
-
# Game Consoles
|
104
|
-
"Mozilla/5.0 (PlayStation 5 3.01) AppleWebKit/605.1.15 (KHTML, like Gecko)",
|
105
|
-
"Mozilla/5.0 (Xbox One 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36 Edge/44.18363.8740",
|
106
|
-
]
|
107
|
-
agents = random.choice(user_agents)
|
108
|
-
return agents
|
109
|
-
|
110
|
-
# # Function to change Tor IP address
|
111
|
-
# def renew_tor_ip():
|
112
|
-
# with Controller.from_port(port=9051) as controller:
|
113
|
-
# controller.authenticate()
|
114
|
-
# controller.signal(Signal.NEWNYM)
|
115
|
-
|
116
|
-
# # Function to make requests through Tor
|
117
|
-
# def make_tor_request(url, max_retries=3):
|
118
|
-
# renew_tor_ip()
|
119
|
-
# headers = {"User-Agent": user_agent()}
|
120
|
-
# session = requests.Session()
|
121
|
-
# session.proxies = {"http": "socks5h://localhost:9050", "https": "socks5h://localhost:9050"}
|
122
|
-
|
123
|
-
# for i in range(max_retries):
|
124
|
-
# try:
|
125
|
-
# response = session.get(url, headers=headers, timeout=10)
|
126
|
-
# if response.status_code == 200:
|
127
|
-
# return response.text
|
128
|
-
# except requests.exceptions.RequestException as e:
|
129
|
-
# print(f"Error: {e}")
|
130
|
-
# time.sleep(2) # Add a delay between retries
|
131
|
-
|
132
|
-
# return None
|
133
|
-
|
134
|
-
|
135
|
-
def find_links(url):
|
136
|
-
links_href = [] # Initialize list to store extracted links
|
137
|
-
content_type, content = fetch_all(url)
|
138
|
-
base_url = urlparse(url)
|
139
|
-
links = content.find_all("a", href=True)
|
140
|
-
for link in links:
|
141
|
-
link_href = link["href"]
|
142
|
-
if not link_href.startswith(('http://', 'https://')):
|
143
|
-
# Convert relative links to absolute links
|
144
|
-
link_href = urljoin(base_url.geturl(), link_href)
|
145
|
-
links_href.append(link_href)
|
146
|
-
return links_href
|
147
|
-
|
148
|
-
def find_domain(links):
|
149
|
-
domains = [urlparse(link).netloc for link in links]
|
150
|
-
domain_counts = Counter(domains)
|
151
|
-
most_common_domain = domain_counts.most_common(1)[0][0]
|
152
|
-
# print(f"Most_frequent_domain:{most_common_domain}")
|
153
|
-
return most_common_domain
|
154
|
-
|
155
|
-
# To determine which links are related to target domains(e.g., pages) you are interested in
|
156
|
-
def filter_links(links, domain=None, kind='html'):
|
157
|
-
filtered_links = []
|
158
|
-
if isinstance(kind, (str, list)):
|
159
|
-
kind = tuple(kind)
|
160
|
-
if domain is None:
|
161
|
-
domain = find_domain(links)
|
162
|
-
for link in links:
|
163
|
-
parsed_link = urlparse(link)
|
164
|
-
if parsed_link.netloc == domain and parsed_link.path.endswith(kind) and 'javascript:' not in parsed_link:
|
165
|
-
filtered_links.append(link)
|
166
|
-
return filtered_links
|
167
|
-
|
168
|
-
def find_img(url, dir_save="images"):
|
169
|
-
"""
|
170
|
-
Save images referenced in HTML content locally.
|
171
|
-
Args:
|
172
|
-
content (str or BeautifulSoup): HTML content or BeautifulSoup object.
|
173
|
-
url (str): URL of the webpage.
|
174
|
-
content_type (str): Type of content. Default is "html".
|
175
|
-
dir_save (str): Directory to save images. Default is "images".
|
176
|
-
Returns:
|
177
|
-
str: HTML content with updated image URLs pointing to local files.
|
178
|
-
"""
|
179
|
-
content_type, content = fetch_all(url)
|
180
|
-
if "html" in content_type.lower():
|
181
|
-
# Create the directory if it doesn't exist
|
182
|
-
os.makedirs(dir_save, exist_ok=True)
|
183
|
-
|
184
|
-
# Parse HTML content if it's not already a BeautifulSoup object
|
185
|
-
if isinstance(content, str):
|
186
|
-
content = BeautifulSoup(content, "html.parser")
|
187
|
-
image_links=[]
|
188
|
-
# Extracting images
|
189
|
-
images = content.find_all("img", src=True)
|
190
|
-
for i, image in enumerate(images):
|
191
|
-
try:
|
192
|
-
# Get the image URL
|
193
|
-
image_url = image["src"]
|
194
|
-
|
195
|
-
if image_url.startswith("data:image"):
|
196
|
-
# Extract the image data from the data URI
|
197
|
-
mime_type, base64_data = image_url.split(",", 1)
|
198
|
-
# Determine the file extension from the MIME type
|
199
|
-
if ":" in mime_type:
|
200
|
-
# image_extension = mime_type.split(":")[1].split(";")[0]
|
201
|
-
image_extension = mime_type.split(":")[1].split(";")[0].split("/")[-1]
|
202
|
-
else:
|
203
|
-
image_extension = "png" # Default to PNG if extension is not specified
|
204
|
-
# if 'svg+xml' in image_extension:
|
205
|
-
# image_extension='svg'
|
206
|
-
image_data = base64.b64decode(base64_data)
|
207
|
-
# Save the image data to a file
|
208
|
-
image_filename = os.path.join(
|
209
|
-
dir_save, f"image_{i}.{image_extension}"
|
210
|
-
)
|
211
|
-
with open(image_filename, "wb") as image_file:
|
212
|
-
image_file.write(image_data)
|
213
|
-
|
214
|
-
# Update the src attribute of the image tag to point to the local file
|
215
|
-
image["src"] = image_filename
|
216
|
-
else:
|
217
|
-
# Construct the absolute image URL
|
218
|
-
absolute_image_url = urljoin(url, image_url)
|
219
|
-
|
220
|
-
# Parse the image URL to extract the file extension
|
221
|
-
parsed_url = urlparse(absolute_image_url)
|
222
|
-
image_extension = os.path.splitext(parsed_url.path)[1]
|
223
|
-
|
224
|
-
# Download the image
|
225
|
-
image_response = requests.get(absolute_image_url)
|
226
|
-
|
227
|
-
# Save the image to a file
|
228
|
-
image_filename = os.path.join(
|
229
|
-
dir_save, f"image_{i}{image_extension}"
|
230
|
-
)
|
231
|
-
with open(image_filename, "wb") as image_file:
|
232
|
-
image_file.write(image_response.content)
|
233
|
-
|
234
|
-
# Update the src attribute of the image tag to point to the local file
|
235
|
-
image["src"] = image_filename
|
236
|
-
except (requests.RequestException, KeyError) as e:
|
237
|
-
print(f"Failed to process image {image_url}: {e}")
|
238
|
-
print(f"images were saved at\n{dir_save}")
|
239
|
-
# Return the HTML content with updated image URLs
|
240
|
-
return content
|
241
|
-
|
242
|
-
def content_div_class(content, div="div", div_class="highlight"):
|
243
|
-
texts = [div.text for div in content.find_all(div, class_=div_class)]
|
244
|
-
return texts
|
245
|
-
def find(url, where="div", what="highlight"):
|
246
|
-
_,content = fetch_all(url, parser="html.parser")
|
247
|
-
texts = [div.text for div in content.find_all(where, class_=what)]
|
248
|
-
return texts
|
249
|
-
# usage example:
|
250
|
-
#### img2local(url, "/Users/macjianfeng/Desktop/@tmp/dd/")
|
251
|
-
def find_forms(url):
|
252
|
-
content_type, content = fetch_all(url)
|
253
|
-
df=pd.DataFrame()
|
254
|
-
# Extracting forms and inputs
|
255
|
-
forms = content.find_all("form")
|
256
|
-
form_data = []
|
257
|
-
for form in forms:
|
258
|
-
form_inputs = form.find_all("input")
|
259
|
-
input_data = {}
|
260
|
-
for input_tag in form_inputs:
|
261
|
-
input_type = input_tag.get("type")
|
262
|
-
input_name = input_tag.get("name")
|
263
|
-
input_value = input_tag.get("value")
|
264
|
-
input_data[input_name] = {"type": input_type, "value": input_value}
|
265
|
-
form_data.append(input_data)
|
266
|
-
return form_data
|
267
|
-
# to clean strings
|
268
|
-
def clean_string(value):
|
269
|
-
if isinstance(value, str):
|
270
|
-
return value.replace('\n', '').replace('\r', '').replace('\t', '')
|
271
|
-
else:
|
272
|
-
return value
|
273
|
-
def find_all(url, dir_save=None):
|
274
|
-
content_type, content = fetch_all(url)
|
275
|
-
|
276
|
-
# Extracting paragraphs
|
277
|
-
paragraphs_text = [paragraph.text for paragraph in content.find_all("p")]
|
278
|
-
|
279
|
-
# Extracting specific elements by class
|
280
|
-
specific_elements_text = [element.text for element in content.find_all(class_="specific-class")]
|
281
|
-
|
282
|
-
# Extracting links (anchor tags)
|
283
|
-
links_href = find_links(url)
|
284
|
-
links_href = filter_links(links_href)
|
285
|
-
|
286
|
-
# Extracting images
|
287
|
-
images_src = [image['src'] for image in content.find_all("img", src=True)]
|
288
|
-
|
289
|
-
# Extracting headings (h1, h2, h3, etc.)
|
290
|
-
headings = [f'h{i}' for i in range(1, 7)]
|
291
|
-
headings_text = {heading: [tag.text for tag in content.find_all(heading)] for heading in headings}
|
292
|
-
|
293
|
-
# Extracting lists (ul, ol, li)
|
294
|
-
list_items_text = [item.text for list_ in content.find_all(["ul", "ol"]) for item in list_.find_all("li")]
|
295
|
-
|
296
|
-
# Extracting tables (table, tr, td)
|
297
|
-
table_cells_text = [cell.text for table in content.find_all("table") for row in table.find_all("tr") for cell in row.find_all("td")]
|
298
|
-
|
299
|
-
# Extracting other elements
|
300
|
-
divs_content = [div.text.strip() for div in content.find_all("div")]
|
301
|
-
headers_footer_content = [tag.text for tag in content.find_all(["header", "footer"])]
|
302
|
-
meta_tags_content = [(tag.name, tag.attrs) for tag in content.find_all("meta")]
|
303
|
-
spans_content = [span.text for span in content.find_all("span")]
|
304
|
-
bold_text_content = [text.text for text in content.find_all("b")]
|
305
|
-
italic_text_content = [text.text for text in content.find_all("i")]
|
306
|
-
code_snippets_content = [code.text for code in content.find_all("code")]
|
307
|
-
blockquotes_content = [blockquote.text for blockquote in content.find_all("blockquote")]
|
308
|
-
preformatted_text_content = [pre.text for pre in content.find_all("pre")]
|
309
|
-
buttons_content = [button.text for button in content.find_all("button")]
|
310
|
-
navs_content = [nav.text for nav in content.find_all("nav")]
|
311
|
-
sections_content = [section.text for section in content.find_all("section")]
|
312
|
-
articles_content = [article.text for article in content.find_all("article")]
|
313
|
-
figures_content = [figure.text for figure in content.find_all("figure")]
|
314
|
-
captions_content = [caption.text for caption in content.find_all("figcaption")]
|
315
|
-
abbreviations_content = [abbr.text for abbr in content.find_all("abbr")]
|
316
|
-
definitions_content = [definition.text for definition in content.find_all("dfn")]
|
317
|
-
addresses_content = [address.text for address in content.find_all("address")]
|
318
|
-
time_elements_content = [time.text for time in content.find_all("time")]
|
319
|
-
progress_content = [progress.text for progress in content.find_all("progress")]
|
320
|
-
meter_content = [meter.text for meter in content.find_all("meter")]
|
321
|
-
forms = find_forms(url)
|
322
|
-
|
323
|
-
lists_to_fill = [
|
324
|
-
paragraphs_text, specific_elements_text, links_href, images_src,
|
325
|
-
headings_text["h1"], headings_text["h2"], headings_text["h3"], headings_text["h4"],
|
326
|
-
headings_text["h5"], headings_text["h6"], list_items_text, table_cells_text,
|
327
|
-
divs_content, headers_footer_content, meta_tags_content, spans_content,
|
328
|
-
bold_text_content, italic_text_content, code_snippets_content,
|
329
|
-
blockquotes_content, preformatted_text_content, buttons_content,
|
330
|
-
navs_content, sections_content, articles_content, figures_content,
|
331
|
-
captions_content, abbreviations_content, definitions_content,
|
332
|
-
addresses_content, time_elements_content, progress_content,
|
333
|
-
meter_content,forms
|
334
|
-
]
|
335
|
-
# add new features
|
336
|
-
script_texts=content_div_class(content, div="div", div_class="highlight")
|
337
|
-
lists_to_fill.append(script_texts)
|
338
|
-
|
339
|
-
audio_src = [audio['src'] for audio in content.find_all("audio", src=True)]
|
340
|
-
video_src = [video['src'] for video in content.find_all("video", src=True)]
|
341
|
-
iframe_src = [iframe['src'] for iframe in content.find_all("iframe", src=True)]
|
342
|
-
lists_to_fill.extend([audio_src, video_src, iframe_src])
|
343
|
-
|
344
|
-
rss_links = [link['href'] for link in content.find_all('link', type=['application/rss+xml', 'application/atom+xml'])]
|
345
|
-
lists_to_fill.append(rss_links)
|
346
|
-
|
347
|
-
# Find the maximum length among all lists
|
348
|
-
max_length = max(len(lst) for lst in lists_to_fill)
|
349
|
-
|
350
|
-
# Fill missing data with empty strings for each list
|
351
|
-
for lst in lists_to_fill:
|
352
|
-
lst += [""] * (max_length - len(lst))
|
353
|
-
|
354
|
-
# Create DataFrame
|
355
|
-
df = pd.DataFrame({
|
356
|
-
"headings1": headings_text["h1"],
|
357
|
-
"headings2": headings_text["h2"],
|
358
|
-
"headings3": headings_text["h3"],
|
359
|
-
"headings4": headings_text["h4"],
|
360
|
-
"headings5": headings_text["h5"],
|
361
|
-
"headings6": headings_text["h6"],
|
362
|
-
"paragraphs": paragraphs_text,
|
363
|
-
"list_items": list_items_text,
|
364
|
-
"table_cells": table_cells_text,
|
365
|
-
"headers_footer": headers_footer_content,
|
366
|
-
"meta_tags": meta_tags_content,
|
367
|
-
"spans": spans_content,
|
368
|
-
"bold_text": bold_text_content,
|
369
|
-
"italic_text": italic_text_content,
|
370
|
-
"code_snippets": code_snippets_content,
|
371
|
-
"blockquotes": blockquotes_content,
|
372
|
-
"preformatted_text": preformatted_text_content,
|
373
|
-
"buttons": buttons_content,
|
374
|
-
"navs": navs_content,
|
375
|
-
"sections": sections_content,
|
376
|
-
"articles": articles_content,
|
377
|
-
"figures": figures_content,
|
378
|
-
"captions": captions_content,
|
379
|
-
"abbreviations": abbreviations_content,
|
380
|
-
"definitions": definitions_content,
|
381
|
-
"addresses": addresses_content,
|
382
|
-
"time_elements": time_elements_content,
|
383
|
-
"progress": progress_content,
|
384
|
-
"specific_elements": specific_elements_text,
|
385
|
-
"meter": meter_content,
|
386
|
-
"forms":forms,
|
387
|
-
"scripts":script_texts,
|
388
|
-
"audio":audio_src,
|
389
|
-
"video":video_src,
|
390
|
-
"iframe":iframe_src,
|
391
|
-
"rss": rss_links,
|
392
|
-
"images": images_src,
|
393
|
-
"links": links_href,
|
394
|
-
"divs": divs_content,
|
395
|
-
})
|
396
|
-
# to remove the '\n\t\r'
|
397
|
-
df=df.apply(lambda x: x.map(clean_string) if x.dtype == "object" else x) # df=df.applymap(clean_string)
|
398
|
-
if dir_save:
|
399
|
-
if not dir_save.endswith(".csv"):
|
400
|
-
dir_save=dir_save+"_df.csv"
|
401
|
-
df.to_csv(dir_save)
|
402
|
-
else:
|
403
|
-
df.to_csv(dir_save)
|
404
|
-
print(f"file has been saved at\n{dir_save}")
|
405
|
-
return df
|
py2ls/version.py
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
version = "0.0.1"
|
File without changes
|