py2ls 0.2.4.32__py3-none-any.whl → 0.2.4.34__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
py2ls/netfinder.py CHANGED
@@ -1,4 +1,5 @@
1
1
  from bs4 import BeautifulSoup
2
+ import scrapy
2
3
  import requests
3
4
  import os
4
5
  import pandas as pd
@@ -332,6 +333,94 @@ def parse_cookies(cookies_str):
332
333
 
333
334
  return cookies_dict
334
335
 
336
class FetchSpider(scrapy.Spider):
    """Spider that requests its start URL(s) and yields each page as a BeautifulSoup tree."""

    name = "fetch_spider"

    def __init__(self, url, parser="html.parser", cookies=None, headers=None, *args, **kwargs):
        """
        Store the request configuration for the crawl.

        Args:
            url (str): The single URL placed in ``start_urls``.
            parser (str): Parser name handed to BeautifulSoup in ``parse``.
            cookies (dict): Cookies attached to each request (may be None).
            headers (dict): HTTP headers attached to each request (may be None).
        """
        super().__init__(*args, **kwargs)
        self.parser = parser
        self.headers = headers
        self.cookies = cookies
        self.start_urls = [url]

    def start_requests(self):
        """Yield one request per start URL, carrying the stored cookies and headers."""
        for target in self.start_urls:
            request = scrapy.Request(
                target,
                cookies=self.cookies,
                headers=self.headers,
                callback=self.parse,
            )
            yield request

    def parse(self, response):
        """Parse the response body with the configured parser and yield it under "content"."""
        from bs4 import BeautifulSoup

        parsed_page = BeautifulSoup(response.text, self.parser)
        yield {"content": parsed_page}
360
+
361
+
362
def fetch_scrapy(
    url,
    parser="html.parser",
    cookies=None,
    headers=None,
    settings=None,
):
    """
    Fetch a single URL with Scrapy and return the page parsed by BeautifulSoup.

    Args:
        url (str): The URL to scrape.
        parser (str): Parser for BeautifulSoup (e.g., "lxml", "html.parser").
        cookies (dict): Cookies to pass in the request.
        headers (dict): HTTP headers for the request.
        settings (dict): Scrapy settings. Keys supplied here take precedence
            over the built-in defaults below, and the dict is not modified.
            When omitted (or empty), the project settings are loaded and the
            defaults are applied on top of them.

    Returns:
        The "content" value of the first item scraped by FetchSpider (a
        BeautifulSoup object), or None if no item was scraped.

    Note:
        NOTE(review): CrawlerProcess.start() runs the Twisted reactor; per the
        Scrapy documentation the reactor cannot be restarted, so a second call
        in the same process is expected to fail -- confirm against callers.
    """
    from scrapy.utils.project import get_project_settings
    from scrapy.crawler import CrawlerProcess
    from scrapy import signals

    # Container for scraped content
    content = []

    # Callback for the item_scraped signal; collects the parsed page.
    def handle_item(item, response, spider):
        content.append(item["content"])

    # Fallback values, used only where the caller did not specify a setting.
    defaults = {
        "USER_AGENT": "CustomUserAgent/1.0",  # Use a custom user agent
        "DOWNLOAD_DELAY": 1,  # Prevent overloading servers
        "COOKIES_ENABLED": bool(cookies),
        "LOG_LEVEL": "ERROR",  # Minimize log verbosity
    }
    if settings:
        # Build a new mapping so the caller's dict is left untouched and the
        # caller's explicit values win over the defaults.
        process_settings = {**defaults, **settings}
    else:
        process_settings = get_project_settings()
        process_settings.update(defaults)

    # Initialize the process and listen only to this crawler's signals.
    process = CrawlerProcess(settings=process_settings)
    crawler = process.create_crawler(FetchSpider)
    crawler.signals.connect(handle_item, signal=signals.item_scraped)

    # Start the Scrapy crawl
    process.crawl(
        crawler,
        url=url,
        parser=parser,
        cookies=cookies,
        headers=headers,
    )
    process.start()  # Blocks until all crawls are finished

    # Return the first scraped content or None if empty
    return content[0] if content else None
422
+
423
+
335
424
  def fetch_all(
336
425
  url,
337
426
  parser="lxml",
@@ -558,6 +647,16 @@ def fetch_all(
558
647
  else:
559
648
  logger.warning("Selenium could not fetch content")
560
649
  return None, None
650
+ elif 'scr' in driver.lower():
651
+ settings = {
652
+ "USER_AGENT": user_agent(),
653
+ "DOWNLOAD_DELAY": 1, # Prevent overloading the server
654
+ "COOKIES_ENABLED": True if cookies else False,
655
+ "LOG_LEVEL": "WARNING", # Reduce log verbosity
656
+ }
657
+ content=fetch_scrapy(url, parser=parser, cookies=cookies, headers=headers, settings=settings)
658
+ return parser, content
659
+
561
660
  except requests.RequestException as e:
562
661
  logger.error(f"Error fetching URL '{url}': {e}")
563
662
  return None, None
py2ls/ocr.py CHANGED
@@ -1,24 +1,15 @@
1
- import easyocr
1
+
2
2
  import cv2
3
3
  import numpy as np
4
4
  import matplotlib.pyplot as plt
5
5
  from py2ls.ips import (
6
6
  strcmp,
7
7
  detect_angle,
8
- ) # Ensure this function is defined in your 'ips' module
9
- from spellchecker import SpellChecker
10
- import re
11
-
12
- from PIL import Image, ImageDraw, ImageFont
13
- import PIL.PngImagePlugin
14
- import pytesseract
15
- from paddleocr import PaddleOCR
8
+ str2words,
9
+ isa
10
+ )
16
11
  import logging
17
12
 
18
- logging.getLogger("ppocr").setLevel(
19
- logging.WARNING
20
- ) # or logging.ERROR to show only error messages
21
-
22
13
  """
23
14
  Optical Character Recognition (OCR)
24
15
  """
@@ -285,10 +276,12 @@ def add_text_pil(
285
276
  image,
286
277
  text,
287
278
  position,
279
+ cvt_cmp=True,
288
280
  font_size=12,
289
281
  color=(0, 0, 0),
290
282
  bg_color=(133, 203, 245, 100),
291
283
  ):
284
+ from PIL import Image, ImageDraw, ImageFont
292
285
  # Convert the image to PIL format
293
286
  pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)).convert("RGBA")
294
287
  # Define the font (make sure to use a font that supports Chinese characters)
@@ -337,7 +330,7 @@ def add_text_pil(
337
330
  overlay = overlay.convert("RGBA")
338
331
  combined = Image.alpha_composite(pil_image, overlay)
339
332
  # Convert the image back to OpenCV format
340
- image = cv2.cvtColor(np.array(combined), cv2.COLOR_RGBA2BGR)
333
+ image = cv2.cvtColor(np.array(combined), cv2.COLOR_RGBA2BGR) #if cvt_cmp else np.array(combined)
341
334
  return image
342
335
 
343
336
 
@@ -348,7 +341,7 @@ def preprocess_img(
348
341
  threshold_method="adaptive",
349
342
  rotate="auto",
350
343
  skew=False,
351
- blur=True,
344
+ blur=False,#True,
352
345
  blur_ksize=(5, 5),
353
346
  morph=True,
354
347
  morph_op="open",
@@ -384,12 +377,14 @@ def preprocess_img(
384
377
  clahe_grid_size: CLAHE 的网格大小。
385
378
  edge_detection: 是否进行边缘检测。
386
379
  """
380
+ import PIL.PngImagePlugin
387
381
  if isinstance(image, PIL.PngImagePlugin.PngImageFile):
388
382
  image = np.array(image)
389
383
  if isinstance(image, str):
390
384
  image = cv2.imread(image)
391
385
  if not isinstance(image, np.ndarray):
392
386
  image = np.array(image)
387
+
393
388
  try:
394
389
  if image.shape[1] == 4: # Check if it has an alpha channel
395
390
  # Drop the alpha channel (if needed), or handle it as required
@@ -507,6 +502,8 @@ def text_postprocess(
507
502
  pattern=None,
508
503
  merge=True,
509
504
  ):
505
+ import re
506
+ from spellchecker import SpellChecker
510
507
 
511
508
  def correct_spelling(text_list):
512
509
  spell = SpellChecker()
@@ -531,9 +528,9 @@ def text_postprocess(
531
528
  return merged_text
532
529
 
533
530
  results = text
534
- print(results)
535
531
  if spell_check:
536
- results = correct_spelling(results)
532
+ # results = correct_spelling(results)
533
+ results=str2words(results)
537
534
  if clean:
538
535
  results = clean_text(results)
539
536
  if filter:
@@ -552,42 +549,39 @@ def get_text(
552
549
  image,
553
550
  lang=["ch_sim", "en"],
554
551
  model="paddleocr", # "pytesseract","paddleocr","easyocr"
555
- thr=0.1,
552
+ thr=0.1,
556
553
  gpu=True,
557
554
  decoder="wordbeamsearch", #'greedy', 'beamsearch' and 'wordbeamsearch'(hightly accurate)
558
555
  output="txt",
559
556
  preprocess=None,
560
- postprocess="not ready",
557
+ postprocess=False,# do not check spell
561
558
  show=True,
562
559
  ax=None,
563
560
  cmap=cv2.COLOR_BGR2RGB, # draw_box
564
- font=cv2.FONT_HERSHEY_SIMPLEX,
565
- font_scale=0.8,
566
- thickness_text=2, # Line thickness of 2 px
567
- box_color=(0, 255, 0), # draw_box
568
- font_color=(0, 0, 0),
569
- bg_color=(133, 203, 245, 100),
561
+ font=cv2.FONT_HERSHEY_SIMPLEX,# draw_box
562
+ fontsize=8,# draw_box
563
+ figsize=[10,10],
564
+ box_color = (0, 255, 0), # draw_box
565
+ fontcolor = (0, 0, 0),# draw_box
566
+ bg_color=(133, 203, 245, 100),# draw_box
570
567
  usage=False,
571
568
  **kwargs,
572
569
  ):
573
570
  """
574
- 功能: 该函数使用 EasyOCR 进行文本识别,并允许自定义图像预处理步骤和结果展示。
575
- 参数:
576
- image: 输入的图像路径或图像数据。
577
- lang: OCR 语言列表。
578
- thr: 置信度阈值,低于此阈值的检测结果将被过滤。
579
- gpu: 是否使用 GPU。
580
- output: 输出类型,可以是 'all'(返回所有检测结果)、'text'(返回文本)、'score'(返回置信度分数)、'box'(返回边界框)。
581
- preprocess: 预处理参数字典,传递给 preprocess_img 函数。
582
- show: 是否显示结果图像。
583
- ax: 用于显示图像的 Matplotlib 子图。
584
- cmap: 用于显示图像的颜色映射。
585
- box_color: 边界框的颜色。
586
- font_color: 文本的颜色。
587
- kwargs: 传递给 EasyOCR readtext 函数的其他参数。
588
-
589
- # Uage
571
+ image: 输入的图像路径或图像数据。
572
+ lang: OCR 语言列表。
573
+ thr: 置信度阈值,低于此阈值的检测结果将被过滤。
574
+ gpu: 是否使用 GPU。
575
+ output: 输出类型,可以是 'all'(返回所有检测结果)、'text'(返回文本)、'score'(返回置信度分数)、'box'(返回边界框)。
576
+ preprocess: 预处理参数字典,传递给 preprocess_img 函数。
577
+ show: 是否显示结果图像。
578
+ ax: 用于显示图像的 Matplotlib 子图。
579
+ cmap: 用于显示图像的颜色映射。
580
+ box_color: 边界框的颜色。
581
+ fontcolor: 文本的颜色。
582
+ kwargs: 传递给 EasyOCR readtext 函数的其他参数。
590
583
  """
584
+ from PIL import Image
591
585
  if usage:
592
586
  print(
593
587
  """
@@ -612,16 +606,19 @@ def get_text(
612
606
  "edge_detection": False
613
607
  },
614
608
  adjust_contrast=0.7
615
- )
616
- """
617
- )
609
+ )""")
618
610
 
619
- models = ["easyocr", "paddleocr", "pytesseract","ddddocr"]
611
+ models = ["easyocr", "paddleocr", "pytesseract","ddddocr","zerox"]
620
612
  model = strcmp(model, models)[0]
621
613
  lang = lang_auto_detect(lang, model)
622
- if isinstance(image, str):
623
- dir_img=image
614
+ cvt_cmp=True
615
+ if isinstance(image, str) and isa(image,'file'):
624
616
  image = cv2.imread(image)
617
+ elif isa(image,'image'):
618
+ cvt_cmp=False
619
+ image = np.array(image)
620
+ else:
621
+ raise ValueError(f"not support image with {type(image)} type")
625
622
 
626
623
  # Ensure lang is always a list
627
624
  if isinstance(lang, str):
@@ -631,110 +628,96 @@ def get_text(
631
628
  if preprocess is None:
632
629
  preprocess = {}
633
630
  image_process = preprocess_img(image, **preprocess)
631
+ plt.figure(figsize=figsize) if show else None
632
+ # plt.subplot(131)
633
+ # plt.imshow(cv2.cvtColor(image, cmap)) if cvt_cmp else plt.imshow(image)
634
+ # plt.subplot(132)
635
+ # plt.imshow(image_process)
636
+ # plt.subplot(133)
634
637
  if "easy" in model.lower():
638
+ import easyocr
635
639
  print(f"detecting language(s):{lang}")
636
640
  # Perform OCR on the image
637
641
  reader = easyocr.Reader(lang, gpu=gpu)
638
642
  detections = reader.readtext(image_process, decoder=decoder, **kwargs)
639
- if postprocess is None:
640
- postprocess = dict(
641
- spell_check=True,
642
- clean=True,
643
- filter=dict(min_length=2),
644
- pattern=None,
645
- merge=True,
646
- )
647
- text_corr = []
648
- [
649
- text_corr.extend(text_postprocess(text, **postprocess))
650
- for _, text, _ in detections
651
- ]
643
+
644
+ text_corr = []
645
+ for _, text, _ in detections:
646
+ text_corr.append(text_postprocess(text) if postprocess else text)
647
+
652
648
  if show:
653
649
  if ax is None:
654
650
  ax = plt.gca()
655
- for bbox, text, score in detections:
651
+ for i, (bbox, text, score) in enumerate(detections):
656
652
  if score > thr:
657
653
  top_left = tuple(map(int, bbox[0]))
658
654
  bottom_right = tuple(map(int, bbox[2]))
659
- image = cv2.rectangle(image, top_left, bottom_right, box_color, 2)
660
- # image = cv2.putText(
661
- # image, text, top_left, font, font_scale, font_color, thickness_text
662
- # )
655
+ image = cv2.rectangle(image, top_left, bottom_right, box_color, 2)
663
656
  image = add_text_pil(
664
657
  image,
665
- text,
658
+ text_corr[i],
666
659
  top_left,
667
- font_size=font_scale * 32,
668
- color=font_color,
660
+ cvt_cmp=cvt_cmp,
661
+ font_size=fontsize *6,
662
+ color=fontcolor,
669
663
  )
670
- # img_cmp = cv2.cvtColor(image, cmap)
671
- ax.imshow(image)
664
+ try:
665
+ img_cmp = cv2.cvtColor(image, cmap) if cvt_cmp else image
666
+ except:
667
+ img_cmp=image
668
+
669
+ ax.imshow(img_cmp) if cvt_cmp else ax.imshow(image)
672
670
  ax.axis("off")
673
- # plt.show()
674
- # 根据输出类型返回相应的结果
671
+
675
672
  if output == "all":
676
673
  return ax, detections
677
674
  elif "t" in output.lower() and "x" in output.lower():
678
- # 提取文本,过滤低置信度的结果
679
675
  text = [text_ for _, text_, score_ in detections if score_ >= thr]
680
676
  if postprocess:
681
677
  return ax, text
682
678
  else:
683
679
  return text_corr
684
680
  elif "score" in output.lower() or "prob" in output.lower():
685
- # 提取分数
686
681
  scores = [score_ for _, _, score_ in detections]
687
682
  return ax, scores
688
683
  elif "box" in output.lower():
689
- # 提取边界框,过滤低置信度的结果
690
684
  bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
691
685
  return ax, bboxes
692
686
  else:
693
- # 默认返回所有检测信息
694
687
  return ax, detections
695
688
  else:
696
- # 根据输出类型返回相应的结果
697
689
  if output == "all":
698
690
  return detections
699
691
  elif "t" in output.lower() and "x" in output.lower():
700
- # 提取文本,过滤低置信度的结果
701
692
  text = [text_ for _, text_, score_ in detections if score_ >= thr]
702
693
  return text
703
694
  elif "score" in output.lower() or "prob" in output.lower():
704
- # 提取分数
705
695
  scores = [score_ for _, _, score_ in detections]
706
696
  return scores
707
697
  elif "box" in output.lower():
708
- # 提取边界框,过滤低置信度的结果
709
698
  bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
710
699
  return bboxes
711
700
  else:
712
- # 默认返回所有检测信息
713
701
  return detections
714
702
  elif "pad" in model.lower():
703
+ from paddleocr import PaddleOCR
704
+ logging.getLogger("ppocr").setLevel(logging.ERROR)
705
+
706
+ lang=strcmp(lang, ['ch','en','french','german','korean','japan'])[0]
715
707
  ocr = PaddleOCR(
716
708
  use_angle_cls=True,
717
709
  cls=True,
710
+ lang=lang
718
711
  ) # PaddleOCR supports only one language at a time
719
- result = ocr.ocr(image_process, **kwargs)
712
+ cls=kwargs.pop('cls',True)
713
+ result = ocr.ocr(image_process,cls=cls, **kwargs)
720
714
  detections = []
721
715
  if result[0] is not None:
722
716
  for line in result[0]:
723
717
  bbox, (text, score) = line
718
+ text = str2words(text) if postprocess else text # check spell
724
719
  detections.append((bbox, text, score))
725
- if postprocess is None:
726
- postprocess = dict(
727
- spell_check=True,
728
- clean=True,
729
- filter=dict(min_length=2),
730
- pattern=None,
731
- merge=True,
732
- )
733
- text_corr = []
734
- [
735
- text_corr.extend(text_postprocess(text, **postprocess))
736
- for _, text, _ in detections
737
- ]
720
+
738
721
  if show:
739
722
  if ax is None:
740
723
  ax = plt.gca()
@@ -746,60 +729,48 @@ def get_text(
746
729
  ) # Bottom-left for more accurate placement
747
730
  bottom_right = tuple(map(int, bbox[2]))
748
731
  image = cv2.rectangle(image, top_left, bottom_right, box_color, 2)
749
- # image = cv2.putText(
750
- # image, text, top_left, font, font_scale, font_color, thickness_text
751
- # )
752
732
  image = add_text_pil(
753
733
  image,
754
734
  text,
755
735
  top_left,
756
- font_size=font_scale * 32,
757
- color=font_color,
736
+ cvt_cmp=cvt_cmp,
737
+ font_size=fontsize *6,
738
+ color=fontcolor,
758
739
  bg_color=bg_color,
759
740
  )
760
- img_cmp = cv2.cvtColor(image, cmap)
761
- ax.imshow(image)
741
+ try:
742
+ img_cmp = cv2.cvtColor(image, cmap) if cvt_cmp else image
743
+ except:
744
+ img_cmp = image
745
+
746
+ ax.imshow(img_cmp)
762
747
  ax.axis("off")
763
- # plt.show()
764
- # 根据输出类型返回相应的结果
765
748
  if output == "all":
766
749
  return ax, detections
767
750
  elif "t" in output.lower() and "x" in output.lower():
768
- # 提取文本,过滤低置信度的结果
769
751
  text = [text_ for _, text_, score_ in detections if score_ >= thr]
770
- if postprocess:
771
- return ax, text
772
- else:
773
- return text_corr
752
+ return ax, text
774
753
  elif "score" in output.lower() or "prob" in output.lower():
775
- # 提取分数
776
754
  scores = [score_ for _, _, score_ in detections]
777
755
  return ax, scores
778
756
  elif "box" in output.lower():
779
- # 提取边界框,过滤低置信度的结果
780
757
  bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
781
758
  return ax, bboxes
782
759
  else:
783
- # 默认返回所有检测信息
784
760
  return ax, detections
785
761
  else:
786
- # 根据输出类型返回相应的结果
787
762
  if output == "all":
788
763
  return detections
789
764
  elif "t" in output.lower() and "x" in output.lower():
790
- # 提取文本,过滤低置信度的结果
791
765
  text = [text_ for _, text_, score_ in detections if score_ >= thr]
792
766
  return text
793
767
  elif "score" in output.lower() or "prob" in output.lower():
794
- # 提取分数
795
768
  scores = [score_ for _, _, score_ in detections]
796
769
  return scores
797
770
  elif "box" in output.lower():
798
- # 提取边界框,过滤低置信度的结果
799
771
  bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
800
772
  return bboxes
801
773
  else:
802
- # 默认返回所有检测信息
803
774
  return detections
804
775
  elif "ddddocr" in model.lower():
805
776
  import ddddocr
@@ -844,7 +815,51 @@ def get_text(
844
815
  ax.imshow(image_vis)
845
816
  ax.axis("off")
846
817
  return detections
818
+
819
+ elif "zerox" in model.lower():
820
+ from pyzerox import zerox
821
+ result = zerox(image_process)
822
+ detections = [(bbox, text, score) for bbox, text, score in result]
823
+ # Postprocess and visualize
824
+ if postprocess is None:
825
+ postprocess = dict(
826
+ spell_check=True,
827
+ clean=True,
828
+ filter=dict(min_length=2),
829
+ pattern=None,
830
+ merge=True,
831
+ )
832
+ text_corr = [text_postprocess(text, **postprocess) for _, text, _ in detections]
833
+
834
+ # Display results if 'show' is True
835
+ if show:
836
+ if ax is None:
837
+ ax = plt.gca()
838
+ for bbox, text, score in detections:
839
+ if score > thr:
840
+ top_left = tuple(map(int, bbox[0]))
841
+ bottom_right = tuple(map(int, bbox[2]))
842
+ image = cv2.rectangle(image, top_left, bottom_right, box_color, 2)
843
+ image = add_text_pil(image, text, top_left, cvt_cmp=cvt_cmp,font_size=fontsize *6, color=fontcolor, bg_color=bg_color)
844
+ ax.imshow(image)
845
+ ax.axis("off")
846
+
847
+ # Return result based on 'output' type
848
+ if output == "all":
849
+ return ax, detections
850
+ elif "t" in output.lower() and "x" in output.lower():
851
+ text = [text_ for _, text_, score_ in detections if score_ >= thr]
852
+ return ax, text
853
+ elif "score" in output.lower() or "prob" in output.lower():
854
+ scores = [score_ for _, _, score_ in detections]
855
+ return ax, scores
856
+ elif "box" in output.lower():
857
+ bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
858
+ return ax, bboxes
859
+ else:
860
+ return detections
847
861
  else: # "pytesseract"
862
+ import pytesseract
848
863
  if ax is None:
849
864
  ax = plt.gca()
850
865
  text = pytesseract.image_to_string(image_process, lang="+".join(lang), **kwargs)
@@ -869,8 +884,9 @@ def get_text(
869
884
  image,
870
885
  char,
871
886
  left,
872
- font_size=font_scale * 32,
873
- color=font_color,
887
+ cvt_cmp=cvt_cmp,
888
+ font_size=fontsize *6,
889
+ color=fontcolor,
874
890
  )
875
891
  img_cmp = cv2.cvtColor(image, cmap)
876
892
  ax.imshow(img_cmp)
@@ -906,8 +922,8 @@ def draw_box(
906
922
  thr=0.25,
907
923
  cmap=cv2.COLOR_BGR2RGB,
908
924
  box_color=(0, 255, 0), # draw_box
909
- font_color=(0, 0, 255), # draw_box
910
- font_scale=0.8,
925
+ fontcolor=(0, 0, 255), # draw_box
926
+ fontsize=8,
911
927
  show=True,
912
928
  ax=None,
913
929
  **kwargs,
@@ -924,12 +940,9 @@ def draw_box(
924
940
  if score > thr:
925
941
  top_left = tuple(map(int, bbox[0]))
926
942
  bottom_right = tuple(map(int, bbox[2]))
927
- image = cv2.rectangle(image, top_left, bottom_right, box_color, 2)
928
- # image = cv2.putText(
929
- # image, text, top_left, font, font_scale, font_color, thickness_text
930
- # )
943
+ image = cv2.rectangle(image, top_left, bottom_right, box_color, 2)
931
944
  image = add_text_pil(
932
- image, text, top_left, font_size=font_scale * 32, color=font_color
945
+ image, text, top_left, cvt_cmp=cvt_cmp,font_size=fontsize *6, color=fontcolor
933
946
  )
934
947
 
935
948
  img_cmp = cv2.cvtColor(image, cmap)