doctra 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. doctra/__init__.py +4 -0
  2. doctra/cli/main.py +168 -0
  3. doctra/engines/image_restoration/__init__.py +10 -0
  4. doctra/engines/image_restoration/docres_engine.py +566 -0
  5. doctra/engines/vlm/service.py +0 -12
  6. doctra/parsers/enhanced_pdf_parser.py +370 -0
  7. doctra/parsers/structured_pdf_parser.py +11 -60
  8. doctra/parsers/table_chart_extractor.py +8 -44
  9. doctra/third_party/docres/data/MBD/MBD.py +110 -0
  10. doctra/third_party/docres/data/MBD/MBD_utils.py +291 -0
  11. doctra/third_party/docres/data/MBD/infer.py +151 -0
  12. doctra/third_party/docres/data/MBD/model/deep_lab_model/aspp.py +95 -0
  13. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/__init__.py +13 -0
  14. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/drn.py +402 -0
  15. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/mobilenet.py +151 -0
  16. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/resnet.py +170 -0
  17. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/xception.py +288 -0
  18. doctra/third_party/docres/data/MBD/model/deep_lab_model/decoder.py +59 -0
  19. doctra/third_party/docres/data/MBD/model/deep_lab_model/deeplab.py +81 -0
  20. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/__init__.py +12 -0
  21. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/batchnorm.py +282 -0
  22. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/comm.py +129 -0
  23. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/replicate.py +88 -0
  24. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/unittest.py +29 -0
  25. doctra/third_party/docres/data/preprocess/crop_merge_image.py +142 -0
  26. doctra/third_party/docres/inference.py +370 -0
  27. doctra/third_party/docres/models/restormer_arch.py +308 -0
  28. doctra/third_party/docres/utils.py +464 -0
  29. doctra/ui/app.py +5 -32
  30. doctra/utils/progress.py +13 -98
  31. doctra/utils/structured_utils.py +45 -49
  32. doctra/version.py +1 -1
  33. {doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/METADATA +1 -1
  34. doctra-0.4.0.dist-info/RECORD +67 -0
  35. doctra-0.3.2.dist-info/RECORD +0 -44
  36. {doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/WHEEL +0 -0
  37. {doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/licenses/LICENSE +0 -0
  38. {doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/top_level.txt +0 -0
doctra/parsers/table_chart_extractor.py
@@ -61,22 +61,17 @@ class ChartTablePDFParser:
     ):
         """
         Initialize the ChartTablePDFParser with extraction configuration.
-
-        Sets up the layout detection engine and optionally the VLM service
-        for structured data extraction.
 
-        :param extract_charts: Whether to extract charts from the document
-        :param extract_tables: Whether to extract tables from the document
-        :param use_vlm: Whether to use VLM for structured data extraction
-        :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter")
+        :param extract_charts: Whether to extract charts from the document (default: True)
+        :param extract_tables: Whether to extract tables from the document (default: True)
+        :param use_vlm: Whether to use VLM for structured data extraction (default: False)
+        :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
         :param vlm_model: Model name to use (defaults to provider-specific defaults)
-        :param vlm_api_key: API key for VLM provider
-        :param layout_model_name: Layout detection model name
-        :param dpi: DPI for PDF rendering
-        :param min_score: Minimum confidence score for layout detection
-        :raises ValueError: If neither extract_charts nor extract_tables is True
+        :param vlm_api_key: API key for VLM provider (required if use_vlm is True)
+        :param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
+        :param dpi: DPI for PDF rendering (default: 200)
+        :param min_score: Minimum confidence score for layout detection (default: 0.0)
         """
-        # Validation
         if not extract_charts and not extract_tables:
             raise ValueError("At least one of extract_charts or extract_tables must be True")
 
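The updated docstring now records each parameter's default. A minimal usage sketch based on those documented defaults; the import path is inferred from the file name doctra/parsers/table_chart_extractor.py, and the API key value is a placeholder:

```python
from doctra.parsers.table_chart_extractor import ChartTablePDFParser

# Defaults per the docstring: charts and tables on, VLM off,
# layout model "PP-DocLayout_plus-L", DPI 200, min_score 0.0.
parser = ChartTablePDFParser()

# VLM-backed extraction requires an API key for the chosen provider.
vlm_parser = ChartTablePDFParser(
    extract_tables=True,
    extract_charts=False,
    use_vlm=True,
    vlm_provider="gemini",
    vlm_api_key="YOUR_API_KEY",  # placeholder; required when use_vlm=True
)
```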
@@ -98,21 +93,15 @@ class ChartTablePDFParser:
     def parse(self, pdf_path: str, output_base_dir: str = "outputs") -> None:
         """
         Parse a PDF document and extract charts and/or tables.
-
-        Processes the PDF through layout detection, extracts the specified
-        element types, saves cropped images, and optionally converts them
-        to structured data using VLM.
 
         :param pdf_path: Path to the input PDF file
         :param output_base_dir: Base directory for output files (default: "outputs")
         :return: None
         """
-        # Create output directory structure: outputs/<filename>/structured_parsing/
         pdf_name = Path(pdf_path).stem
         out_dir = os.path.join(output_base_dir, pdf_name, "structured_parsing")
         os.makedirs(out_dir, exist_ok=True)
 
-        # Create subdirectories based on what we're extracting
        charts_dir = None
        tables_dir = None
 
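Per the docstring and the directory logic above, parse() writes everything under outputs/<pdf-stem>/structured_parsing/. A short sketch (the PDF name is a placeholder; the subdirectory and file names follow from the hunks below):

```python
parser = ChartTablePDFParser()   # defaults: charts and tables, no VLM
parser.parse("report.pdf")
# Expected layout, following the code above:
#   outputs/report/structured_parsing/charts/chart_001.png, chart_002.png, ...
#   outputs/report/structured_parsing/tables/table_001.png, table_002.png, ...
```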
@@ -129,24 +118,20 @@ class ChartTablePDFParser:
         )
         pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
 
-        # Determine which labels to extract
         target_labels = []
         if self.extract_charts:
             target_labels.append("chart")
         if self.extract_tables:
             target_labels.append("table")
 
-        # Count items for progress bars
         chart_count = sum(sum(1 for b in p.boxes if b.label == "chart") for p in pages) if self.extract_charts else 0
         table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages) if self.extract_tables else 0
 
-        # Prepare output content
         if self.use_vlm:
             md_lines: List[str] = ["# Extracted Charts and Tables\n"]
             structured_items: List[Dict[str, Any]] = []
             vlm_items: List[Dict[str, Any]] = []
 
-        # Progress bar descriptions
         charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
         tables_desc = "Tables (VLM → table)" if self.use_vlm else "Tables (cropped)"
 
@@ -154,11 +139,9 @@
         table_counter = 1
 
         with ExitStack() as stack:
-            # Enhanced environment detection
            is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
            is_terminal = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
 
-            # Use appropriate progress bars based on environment
            if is_notebook:
                charts_bar = stack.enter_context(
                    create_notebook_friendly_bar(total=chart_count, desc=charts_desc)) if chart_count else None
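The ExitStack idiom above lets each progress bar be created only when there is matching work, while still guaranteeing cleanup on exit. A stand-alone sketch of the same pattern, using tqdm as a stand-in for the project's create_notebook_friendly_bar helper:

```python
from contextlib import ExitStack
from tqdm import tqdm

chart_count, table_count = 3, 0
with ExitStack() as stack:
    # Bars are entered only when their count is non-zero; the stack
    # closes whichever bars were actually opened.
    charts_bar = stack.enter_context(tqdm(total=chart_count, desc="Charts")) if chart_count else None
    tables_bar = stack.enter_context(tqdm(total=table_count, desc="Tables")) if table_count else None
    for _ in range(chart_count):
        if charts_bar:
            charts_bar.update(1)
```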
@@ -174,23 +157,19 @@
                page_num = p.page_index
                page_img: Image.Image = pil_pages[page_num - 1]
 
-                # Only process selected item types
                target_items = [box for box in p.boxes if box.label in target_labels]
 
                if target_items and self.use_vlm:
                    md_lines.append(f"\n## Page {page_num}\n")
 
                for box in sorted(target_items, key=reading_order_key):
-                    # Handle charts
                    if box.label == "chart" and self.extract_charts:
                        chart_filename = f"chart_{chart_counter:03d}.png"
                        chart_path = os.path.join(charts_dir, chart_filename)
 
-                        # Save image
                        cropped_img = page_img.crop((box.x1, box.y1, box.x2, box.y2))
                        cropped_img.save(chart_path)
 
-                        # Handle VLM processing if enabled
                        if self.use_vlm and self.vlm:
                            rel_path = os.path.join("charts", chart_filename)
                            wrote_table = False
@@ -227,16 +206,13 @@
                        if charts_bar:
                            charts_bar.update(1)
 
-                    # Handle tables
                    elif box.label == "table" and self.extract_tables:
                        table_filename = f"table_{table_counter:03d}.png"
                        table_path = os.path.join(tables_dir, table_filename)
 
-                        # Save image
                        cropped_img = page_img.crop((box.x1, box.y1, box.x2, box.y2))
                        cropped_img.save(table_path)
 
-                        # Handle VLM processing if enabled
                        if self.use_vlm and self.vlm:
                            rel_path = os.path.join("tables", table_filename)
                            wrote_table = False
@@ -273,19 +249,11 @@
                        if tables_bar:
                            tables_bar.update(1)
 
-        # Write outputs only if VLM is used
-        md_path = None
        excel_path = None
 
        if self.use_vlm:
-            # Write markdown file
-            md_path = os.path.join(out_dir, "charts.md")
-            with open(md_path, 'w', encoding='utf-8') as f:
-                f.write('\n'.join(md_lines))
 
-            # Write Excel file if we have structured data
            if structured_items:
-                # Determine Excel filename based on extraction target
                if self.extract_charts and self.extract_tables:
                    excel_filename = "parsed_tables_charts.xlsx"
                elif self.extract_charts:
@@ -299,23 +267,19 @@
                excel_path = os.path.join(out_dir, excel_filename)
                write_structured_excel(excel_path, structured_items)
 
-                # Also create HTML version
                html_filename = excel_filename.replace('.xlsx', '.html')
                html_path = os.path.join(out_dir, html_filename)
                write_structured_html(html_path, structured_items)
 
-            # Write VLM items mapping for UI linkage
            if 'vlm_items' in locals() and vlm_items:
                with open(os.path.join(out_dir, "vlm_items.json"), 'w', encoding='utf-8') as jf:
                    json.dump(vlm_items, jf, ensure_ascii=False, indent=2)
 
-        # Print results
        extraction_types = []
        if self.extract_charts:
            extraction_types.append("charts")
        if self.extract_tables:
            extraction_types.append("tables")
 
-        # Print completion message with output directory
        print(f"✅ Parsing completed successfully!")
        print(f"📁 Output directory: {out_dir}")
doctra/third_party/docres/data/MBD/MBD.py (new file)
@@ -0,0 +1,110 @@
+import cv2
+import numpy as np
+import MBD_utils
+import torch
+import torch.nn.functional as F
+
+
+def mask_base_dewarper(image,mask):
+    '''
+    input:
+        image -> ndarray HxWx3 uint8
+        mask -> ndarray HxW uint8
+    return
+        dewarped -> ndarray HxWx3 uint8
+        grid (optional) -> ndarray HxWx2 -1~1
+    '''
+
+    ## get contours
+    # _, contours, hierarchy = cv2.findContours(mask,cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_NONE) ## cv2.__version__ == 3.x
+    contours,hierarchy = cv2.findContours(mask,cv2.RETR_EXTERNAL,method=cv2.CHAIN_APPROX_SIMPLE) ## cv2.__version__ == 4.x
+
+    ## get biggest contours and four corners based on Douglas-Peucker algorithm
+    four_corners, maxArea, contour = MBD_utils.DP_algorithm(contours)
+    four_corners = MBD_utils.reorder(four_corners)
+
+    ## reserve biggest contours and remove other noisy contours
+    new_mask = np.zeros_like(mask)
+    new_mask = cv2.drawContours(new_mask,[contour],-1,255,cv2.FILLED)
+
+    ## obtain middle points
+    # ratios = [0.25,0.5,0.75] # ratios = [0.125,0.25,0.375,0.5,0.625,0.75,0.875]
+    ratios = [0.25,0.5,0.75]
+    # ratios = [0.0625,0.125,0.1875,0.25,0.3125,0.375,0.4475,0.5,0.5625,0.625,0.06875,0.75,0.8125,0.875,0.9375]
+    middle = MBD_utils.findMiddle(corners=four_corners,mask=new_mask,points=ratios)
+
+    ## all points
+    source_points = np.concatenate((four_corners,middle),axis=0) ## all_point = four_corners(topleft,topright,bottom)+top+bottom+left+right
+
+    ## target points
+    h,w = image.shape[:2]
+    padding = 0
+    target_points = [[padding, padding],[w-padding, padding], [padding, h-padding],[w-padding, h-padding]]
+    for ratio in ratios:
+        target_points.append([int((w-2*padding)*ratio)+padding,padding])
+    for ratio in ratios:
+        target_points.append([int((w-2*padding)*ratio)+padding,h-padding])
+    for ratio in ratios:
+        target_points.append([padding,int((h-2*padding)*ratio)+padding])
+    for ratio in ratios:
+        target_points.append([w-padding,int((h-2*padding)*ratio)+padding])
+
+    ## dewarp based on cv2
+    # pts1 = np.float32(source_points)
+    # pts2 = np.float32(target_points)
+    # tps = cv2.createThinPlateSplineShapeTransformer()
+    # matches = []
+    # N = pts1.shape[0]
+    # for i in range(0,N):
+    #     matches.append(cv2.DMatch(i,i,0))
+    # pts1 = pts1.reshape(1,-1,2)
+    # pts2 = pts2.reshape(1,-1,2)
+    # tps.estimateTransformation(pts2,pts1,matches)
+    # dewarped = tps.warpImage(image)
+
+    ## dewarp based on generated grid
+    source_points = source_points.reshape(-1,2)/np.array([image.shape[:2][::-1]]).reshape(1,2)
+    source_points = torch.from_numpy(source_points).float().cuda()
+    source_points = source_points.unsqueeze(0)
+    source_points = (source_points-0.5)*2
+    target_points = np.asarray(target_points).reshape(-1,2)/np.array([image.shape[:2][::-1]]).reshape(1,2)
+    target_points = torch.from_numpy(target_points).float()
+    target_points = (target_points-0.5)*2
+
+    model = MBD_utils.TPSGridGen(target_height=256,target_width=256,target_control_points=target_points)
+    model = model.cuda()
+    grid = model(source_points).view(-1,256,256,2).permute(0,3,1,2)
+    grid = F.interpolate(grid,(h,w),mode='bilinear').permute(0,2,3,1)
+    dewarped = MBD_utils.torch2cvimg(F.grid_sample(MBD_utils.cvimg2torch(image).cuda(),grid))[0]
+    return dewarped,grid[0].cpu().numpy()
+
+def mask_base_cropper(image,mask):
+    '''
+    input:
+        image -> ndarray HxWx3 uint8
+        mask -> ndarray HxW uint8
+    return
+        dewarped -> ndarray HxWx3 uint8
+        grid (optional) -> ndarray HxWx2 -1~1
+    '''
+
+    ## get contours
+    _, contours, hierarchy = cv2.findContours(mask,cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_NONE) ## cv2.__version__ == 3.x
+    # contours,hierarchy = cv2.findContours(mask,cv2.RETR_EXTERNAL,method=cv2.CHAIN_APPROX_SIMPLE) ## cv2.__version__ == 4.x
+
+    ## get biggest contours and four corners based on Douglas-Peucker algorithm
+    four_corners, maxArea, contour = MBD_utils.DP_algorithm(contours)
+    four_corners = MBD_utils.reorder(four_corners)
+
+    ## reserve biggest contours and remove other noisy contours
+    new_mask = np.zeros_like(mask)
+    new_mask = cv2.drawContours(new_mask,[contour],-1,255,cv2.FILLED)
+
+    ## minimum-area bounding rectangle
+    rect = cv2.minAreaRect(contour) # returns (center (x, y), (width, height), rotation angle) of the minimum-area rectangle
+    box = cv2.boxPoints(rect) # cv2.boxPoints(rect) for OpenCV 3.x: get the 4 corner points of the minimum-area rectangle
+    box = np.int0(box)
+    box = box.reshape((4,1,2))
+
+
+
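A hedged usage sketch for mask_base_dewarper: MBD.py imports MBD_utils by bare module name, so the MBD directory must be on sys.path, and the .cuda() calls above require a CUDA-capable GPU. File names below are placeholders:

```python
import sys
sys.path.append("doctra/third_party/docres/data/MBD")  # so `import MBD_utils` resolves

import cv2
from MBD import mask_base_dewarper

image = cv2.imread("warped_page.png")                     # HxWx3 uint8 (BGR)
mask = cv2.imread("page_mask.png", cv2.IMREAD_GRAYSCALE)  # HxW uint8, 0 or 255
dewarped, grid = mask_base_dewarper(image, mask)          # grid: HxWx2 in [-1, 1]
cv2.imwrite("dewarped_page.png", dewarped)
```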
doctra/third_party/docres/data/MBD/MBD_utils.py (new file)
@@ -0,0 +1,291 @@
+import cv2
+import numpy as np
+import copy
+import torch
+import torch
+import itertools
+import torch.nn as nn
+from torch.autograd import Function, Variable
+
+def reorder(myPoints):
+    myPoints = myPoints.reshape((4, 2))
+    myPointsNew = np.zeros((4, 1, 2), dtype=np.int32)
+    add = myPoints.sum(1)
+    myPointsNew[0] = myPoints[np.argmin(add)]
+    myPointsNew[3] = myPoints[np.argmax(add)]
+    diff = np.diff(myPoints, axis=1)
+    myPointsNew[1] = myPoints[np.argmin(diff)]
+    myPointsNew[2] = myPoints[np.argmax(diff)]
+    return myPointsNew
+
+
+def findMiddle(corners,mask,points=[0.25,0.5,0.75]):
+    num_middle_points = len(points)
+    top = [np.array([])]*num_middle_points
+    bottom = [np.array([])]*num_middle_points
+    left = [np.array([])]*num_middle_points
+    right = [np.array([])]*num_middle_points
+
+    center_top = []
+    center_bottom = []
+    center_left = []
+    center_right = []
+
+    center = (int((corners[0][0][1]+corners[3][0][1])/2),int((corners[0][0][0]+corners[3][0][0])/2))
+    for ratio in points:
+
+        center_top.append( (center[0],int(corners[0][0][0]*(1-ratio)+corners[1][0][0]*ratio)) )
+
+        center_bottom.append( (center[0],int(corners[2][0][0]*(1-ratio)+corners[3][0][0]*ratio)) )
+
+        center_left.append( (int(corners[0][0][1]*(1-ratio)+corners[2][0][1]*ratio),center[1]) )
+
+        center_right.append( (int(corners[1][0][1]*(1-ratio)+corners[3][0][1]*ratio),center[1]) )
+
+    for i in range(0,center[0],1):
+        for j in range(num_middle_points):
+            if top[j].size==0:
+                if mask[i,center_top[j][1]]==255:
+                    top[j] = np.asarray([center_top[j][1],i])
+                    top[j] = top[j].reshape(1,2)
+
+    for i in range(mask.shape[0]-1,center[0],-1):
+        for j in range(num_middle_points):
+            if bottom[j].size==0:
+                if mask[i,center_bottom[j][1]]==255:
+                    bottom[j] = np.asarray([center_bottom[j][1],i])
+                    bottom[j] = bottom[j].reshape(1,2)
+
+    for i in range(mask.shape[1]-1,center[1],-1):
+        for j in range(num_middle_points):
+            if right[j].size==0:
+                if mask[center_right[j][0],i]==255:
+                    right[j] = np.asarray([i,center_right[j][0]])
+                    right[j] = right[j].reshape(1,2)
+
+    for i in range(0,center[1]):
+        for j in range(num_middle_points):
+            if left[j].size==0:
+                if mask[center_left[j][0],i]==255:
+                    left[j] = np.asarray([i,center_left[j][0]])
+                    left[j] = left[j].reshape(1,2)
+
+    return np.asarray(top+bottom+left+right)
+
+def DP_algorithmv1(contours):
+    biggest = np.array([])
+    max_area = 0
+    step = 0.001
+    count = 0
+    # while biggest.size==0:
+    while True:
+        for i in contours:
+            # print(i.shape)
+            area = cv2.contourArea(i)
+            # print(area,cv2.arcLength(i, True))
+            if area > cv2.arcLength(i, True)*10:
+                peri = cv2.arcLength(i, True)
+                approx = cv2.approxPolyDP(i, (0.01+step*count) * peri, True)
+                if area > max_area and len(approx) == 4:
+                    max_area = area
+                    biggest_contours = i
+                    biggest = approx
+                    break
+        if abs(max_area - cv2.contourArea(biggest))/max_area > 0.3:
+            biggest = np.array([])
+        count += 1
+        if count > 200:
+            break
+    temp = biggest[0]
+    return biggest,max_area, biggest_contours
+
+def DP_algorithm(contours):
+    biggest = np.array([])
+    max_area = 0
+    step = 0.001
+    count = 0
+
+    ### largest contours
+    for i in contours:
+        area = cv2.contourArea(i)
+        if area > max_area:
+            max_area = area
+            biggest_contours = i
+    peri = cv2.arcLength(biggest_contours, True)
+
+    ### find four corners
+    while True:
+        approx = cv2.approxPolyDP(biggest_contours, (0.01+step*count) * peri, True)
+        if len(approx) == 4:
+            biggest = approx
+            break
+        # if abs(max_area - cv2.contourArea(biggest))/max_area > 0.2:
+        # if abs(max_area - cv2.contourArea(biggest))/max_area > 0.4:
+        #     biggest = np.array([])
+        count += 1
+        if count > 200:
+            break
+    return biggest,max_area, biggest_contours
+
+def drawRectangle(img,biggest,color,thickness):
+    cv2.line(img, (biggest[0][0][0], biggest[0][0][1]), (biggest[1][0][0], biggest[1][0][1]), color, thickness)
+    cv2.line(img, (biggest[0][0][0], biggest[0][0][1]), (biggest[2][0][0], biggest[2][0][1]), color, thickness)
+    cv2.line(img, (biggest[3][0][0], biggest[3][0][1]), (biggest[2][0][0], biggest[2][0][1]), color, thickness)
+    cv2.line(img, (biggest[3][0][0], biggest[3][0][1]), (biggest[1][0][0], biggest[1][0][1]), color, thickness)
+    return img
+
+def minAreaRect(contours,img):
+    # biggest = np.array([])
+    max_area = 0
+    for i in contours:
+        area = cv2.contourArea(i)
+        if area > max_area:
+            peri = cv2.arcLength(i, True)
+            rect = cv2.minAreaRect(i)
+            points = cv2.boxPoints(rect)
+            max_area = area
+    return points
+
+def cropRectangle(img,biggest):
+    # print(biggest)
+    w = np.abs(biggest[0][0][0] - biggest[1][0][0])
+    h = np.abs(biggest[0][0][1] - biggest[2][0][1])
+    new_img = np.zeros((w,h,img.shape[-1]),dtype=np.uint8)
+    new_img = img[biggest[0][0][1]:biggest[0][0][1]+h,biggest[0][0][0]:biggest[0][0][0]+w]
+    return new_img
+
+def cvimg2torch(img,min=0,max=1):
+    '''
+    input:
+        im -> ndarray uint8 HxWxC
+    return
+        tensor -> torch.tensor BxCxHxW
+    '''
+    if len(img.shape)==2:
+        img = np.expand_dims(img,axis=-1)
+    img = img.astype(float) / 255.0
+    img = img.transpose(2, 0, 1) # HWC -> CHW
+    img = np.expand_dims(img, 0)
+    img = torch.from_numpy(img).float()
+    return img
+
+def torch2cvimg(tensor,min=0,max=1):
+    '''
+    input:
+        tensor -> torch.tensor BxCxHxW C can be 1,3
+    return
+        im -> ndarray uint8 HxWxC
+    '''
+    im_list = []
+    for i in range(tensor.shape[0]):
+        im = tensor.detach().cpu().data.numpy()[i]
+        im = im.transpose(1,2,0)
+        im = np.clip(im,min,max)
+        im = ((im-min)/(max-min)*255).astype(np.uint8)
+        im_list.append(im)
+    return im_list
+
+
+
+class TPSGridGen(nn.Module):
+    def __init__(self, target_height, target_width, target_control_points):
+        '''
+        target_control_points -> torch.tensor num_pointx2 -1~1
+        source_control_points -> torch.tensor batch_size x num_point x 2 -1~1
+        return:
+            grid -> batch_size x hw x 2 -1~1
+        '''
+        super(TPSGridGen, self).__init__()
+        assert target_control_points.ndimension() == 2
+        assert target_control_points.size(1) == 2
+        N = target_control_points.size(0)
+        self.num_points = N
+        target_control_points = target_control_points.float()
+
+        # create padded kernel matrix
+        forward_kernel = torch.zeros(N + 3, N + 3)
+        target_control_partial_repr = self.compute_partial_repr(target_control_points, target_control_points)
+        forward_kernel[:N, :N].copy_(target_control_partial_repr)
+        forward_kernel[:N, -3].fill_(1)
+        forward_kernel[-3, :N].fill_(1)
+        forward_kernel[:N, -2:].copy_(target_control_points)
+        forward_kernel[-2:, :N].copy_(target_control_points.transpose(0, 1))
+        # compute inverse matrix
+        inverse_kernel = torch.inverse(forward_kernel)
+
+        # create target coordinate matrix
+        HW = target_height * target_width
+        target_coordinate = list(itertools.product(range(target_height), range(target_width)))
+        target_coordinate = torch.Tensor(target_coordinate) # HW x 2
+        Y, X = target_coordinate.split(1, dim = 1)
+        Y = Y * 2 / (target_height - 1) - 1
+        X = X * 2 / (target_width - 1) - 1
+        target_coordinate = torch.cat([X, Y], dim = 1) # convert from (y, x) to (x, y)
+        target_coordinate_partial_repr = self.compute_partial_repr(target_coordinate.to(target_control_points.device), target_control_points)
+        target_coordinate_repr = torch.cat([
+            target_coordinate_partial_repr, torch.ones(HW, 1), target_coordinate
+        ], dim = 1)
+
+        # register precomputed matrices
+        self.register_buffer('inverse_kernel', inverse_kernel)
+        self.register_buffer('padding_matrix', torch.zeros(3, 2))
+        self.register_buffer('target_coordinate_repr', target_coordinate_repr)
+
+    def forward(self, source_control_points):
+        assert source_control_points.ndimension() == 3
+        assert source_control_points.size(1) == self.num_points
+        assert source_control_points.size(2) == 2
+        batch_size = source_control_points.size(0)
+
+        Y = torch.cat([source_control_points, Variable(self.padding_matrix.expand(batch_size, 3, 2))], 1)
+        mapping_matrix = torch.matmul(Variable(self.inverse_kernel), Y)
+        source_coordinate = torch.matmul(Variable(self.target_coordinate_repr), mapping_matrix)
+        return source_coordinate
+
+    # phi(x1, x2) = r^2 * log(r), where r = ||x1 - x2||_2
+    def compute_partial_repr(self, input_points, control_points):
+        N = input_points.size(0)
+        M = control_points.size(0)
+        pairwise_diff = input_points.view(N, 1, 2) - control_points.view(1, M, 2)
+        # original implementation, very slow
+        # pairwise_dist = torch.sum(pairwise_diff ** 2, dim = 2) # square of distance
+        pairwise_diff_square = pairwise_diff * pairwise_diff
+        pairwise_dist = pairwise_diff_square[:, :, 0] + pairwise_diff_square[:, :, 1]
+        repr_matrix = 0.5 * pairwise_dist * torch.log(pairwise_dist)
+        # fix numerical error for 0 * log(0), substitute all nan with 0
+        mask = repr_matrix != repr_matrix
+        repr_matrix.masked_fill_(mask, 0)
+        return repr_matrix
+
+
+
+
+
+### decide whether further processing is needed
+# point_area = cv2.contourArea(np.concatenate((biggest_angle[0].reshape(1,1,2),middle[0:3],biggest_angle[1].reshape(1,1,2),middle[9:12],biggest_angle[3].reshape(1,1,2),middle[3:6][::-1],biggest_angle[2].reshape(1,1,2),middle[6:9][::-1]),axis=0))
+#### minimum-area bounding rectangle
+# rect = cv2.minAreaRect(contour) # returns (center (x, y), (width, height), rotation angle) of the minimum-area rectangle
+# box = cv2.boxPoints(rect) # cv2.boxPoints(rect) for OpenCV 3.x: get the 4 corner points of the minimum-area rectangle
+# box = np.int0(box)
+# box = box.reshape((4,1,2))
+# minrect_area = cv2.contourArea(box)
+# print(abs(minrect_area-point_area)/point_area)
+#### four-corner IOU
+# biggest_box = np.concatenate((biggest_angle[0,:,:].reshape(1,1,2),biggest_angle[2,:,:].reshape(1,1,2),biggest_angle[3,:,:].reshape(1,1,2),biggest_angle[1,:,:].reshape(1,1,2)),axis=0)
+# biggest_mask = np.zeros_like(mask)
+# # corner_area = cv2.contourArea(biggest_box)
+# cv2.drawContours(biggest_mask,[biggest_box], -1, color=255, thickness=-1)
+
+# smooth = 1e-5
+# biggest_mask_ = biggest_mask > 50
+# mask_ = mask > 50
+# intersection = (biggest_mask_ & mask_).sum()
+# union = (biggest_mask_ | mask_).sum()
+# iou = (intersection + smooth) / (union + smooth)
+# if iou > 0.975:
+#     skip = True
+# else:
+#     skip = False
+# print(iou)
+# cv2.imshow('mask',cv2.resize(mask,(512,512)))
+# cv2.imshow('biggest_mask',cv2.resize(biggest_mask,(512,512)))
+# cv2.waitKey(0)
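One step in compute_partial_repr is easy to misread: pairwise_dist holds the squared distance r^2, so 0.5 * r^2 * log(r^2) equals r^2 * log(r), the standard thin-plate-spline radial basis. A tiny self-contained check with an arbitrary value:

```python
import math

r2 = 2.0                                    # squared distance r^2
computed = 0.5 * r2 * math.log(r2)          # what compute_partial_repr evaluates
tps_kernel = r2 * math.log(math.sqrt(r2))   # r^2 * log(r)
assert abs(computed - tps_kernel) < 1e-12
```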