pdf-extract 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/assign.rb +72 -0
- data/bin/config.json +4 -0
- data/bin/fac_v19n11_s5.mask.pdf +0 -0
- data/bin/margins.mask.pdf +0 -0
- data/bin/one-column.mask.pdf +24110 -39
- data/bin/pdf-extract +146 -0
- data/bin/s002040050107_Arch_Toxicol_1994_68_8.mask.pdf +0 -0
- data/bin/some3.mask.pdf +0 -0
- data/bin/some5.mask.pdf +0 -0
- data/bin/some6.mask.pdf +0 -0
- data/bin/train.rb +48 -0
- data/bin/two-column.mask.pdf +0 -0
- data/data/familynames.db +0 -0
- data/data/stopwords.txt +1 -0
- data/lib/analysis/columns.rb +75 -0
- data/lib/analysis/margins.rb +84 -0
- data/lib/analysis/sections.rb +156 -0
- data/lib/analysis/titles.rb +53 -0
- data/lib/analysis/zones.rb +128 -0
- data/lib/font_metrics.rb +240 -0
- data/lib/kmeans.rb +114 -0
- data/lib/language.rb +58 -0
- data/lib/model/characters.rb +320 -0
- data/lib/model/chunks.rb +103 -0
- data/lib/model/regions.rb +112 -0
- data/lib/multi_range.rb +69 -0
- data/lib/names.rb +85 -0
- data/lib/pdf-extract.rb +77 -0
- data/lib/pdf.rb +255 -0
- data/lib/references/references.rb +184 -0
- data/lib/references/resolve.rb +113 -0
- data/lib/references/resolved_references.rb +37 -0
- data/lib/spatial.rb +188 -0
- data/lib/view/abstract_view.rb +32 -0
- data/lib/view/pdf_view.rb +43 -0
- data/lib/view/png_view.rb +30 -0
- data/lib/view/xml_view.rb +113 -0
- metadata +208 -0
| @@ -0,0 +1,320 @@ | |
| 1 | 
            +
            require 'matrix'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            require_relative '../font_metrics'
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            module PdfExtract
         | 
| 6 | 
            +
              module Characters
         | 
| 7 | 
            +
             | 
| 8 | 
            +
                # TODO Implement writing mode and :FontMatrix.
         | 
| 9 | 
            +
             | 
| 10 | 
            +
                # Descent of the font in the topmost graphics state, divided by
                # 1000.0 (presumably converting 1000ths of a text unit into text
                # units, as the width code in this file does).
                #
                # c     - the character being measured; accepted but not consulted.
                # state - the graphics state stack; only state.last is read.
                #
                # Returns 0 when there are no font metrics, or when the metrics
                # report no descent.
                def self.glyph_descent(c, state)
                  metrics = state.last[:font_metrics]
                  return 0 if metrics.nil?

                  descent = metrics.descent
                  descent.nil? ? 0 : descent / 1000.0
                end
         | 
| 17 | 
            +
             | 
| 18 | 
            +
                # Ascent of the font in the topmost graphics state, divided by
                # 1000.0 (presumably converting 1000ths of a text unit into text
                # units, as the width code in this file does).
                #
                # c     - the character being measured; accepted but not consulted.
                # state - the graphics state stack; only state.last is read.
                #
                # Returns 0 when there are no font metrics, or when the metrics
                # report no ascent.
                def self.glyph_ascent(c, state)
                  metrics = state.last[:font_metrics]
                  return 0 if metrics.nil?

                  ascent = metrics.ascent
                  ascent.nil? ? 0 : ascent / 1000.0
                end
         | 
| 25 | 
            +
                
         | 
| 26 | 
            +
                # Width of glyph +c+ under the current graphics state.
                #
                # :Widths may be used to determine glyph width. The width is the
                # same value as the horizontal displacement, so this simply takes
                # the first component of glyph_displacement.
                def self.glyph_width(c, state)
                  displacement = glyph_displacement(c, state)
                  displacement.first
                end
         | 
| 31 | 
            +
             | 
| 32 | 
            +
                # Maximum glyph height for the current font: ascent minus descent.
                #
                # :Ascent and :Descent from the :FontDescriptor can be used to
                # determine this; both values are obtained through glyph_ascent and
                # glyph_descent.
                def self.glyph_height(c, state)
                  top = glyph_ascent(c, state)
                  bottom = glyph_descent(c, state)
                  top - bottom
                end
         | 
| 37 | 
            +
             | 
| 38 | 
            +
                # Displacement of glyph +c+ as a two element array
                # [ horizontal, vertical ].
                #
                # For non-Type3 fonts the horizontal displacement is the glyph
                # width and the vertical displacement is always 0. Glyph widths
                # are given in 1000ths of text units, hence the division by 1000.0.
                #
                # c     - the character whose width is looked up in the font metrics.
                # state - the graphics state stack; only state.last is read.
                #
                # Returns [ 0, 0 ] when the current state carries no font metrics.
                def self.glyph_displacement(c, state)
                  metrics = state.last[:font_metrics]

                  # XXX Why are some font resources not reported via resource_font?
                  # Bug in pdf-reader? Possibly because of :Font entry in graphics
                  # state set.
                  return [ 0, 0 ] if metrics.nil?

                  [ metrics.glyph_width(c) / 1000.0, 0 ]
                end
         | 
| 51 | 
            +
             | 
| 52 | 
            +
                # Builds one character object per character of +text+ and advances
                # the text matrix in +render_state+ past each glyph.
                #
                # text         - the raw string operand of a text showing operator.
                # tj           - position adjustment (in 1000ths of a text unit) that
                #                precedes the string; it is applied once, before the
                #                first character.
                # state        - graphics state stack; state.last supplies :font,
                #                :font_size, :h_scale, :rise, :char_spacing,
                #                :word_spacing and :ctm.
                # render_state - hash whose :tm entry is read and replaced in place.
                # page         - page hash; page[:MediaBox] gives the page size.
                # page_number  - stored verbatim in every character object.
                #
                # Returns an array of hashes with the keys :x, :y, :width, :height,
                # :line_height, :content, :page, :font, :page_width and :page_height.
                #
                # When the current state has no font (see the note in
                # glyph_displacement) the raw character is used as :content and
                # :font is nil, instead of failing with a NoMethodError.
                def self.make_text_runs(text, tj, state, render_state, page, page_number)
                  # TODO Ignore chars outside the page :MediaBox.
                  # TODO Mul UserUnit if specified by page.
                  # TODO Include writing mode, so that runs can be joined either
                  #      vertically or horizontally in the join stage.
                  # TODO Should translate along x or y depending on writing mode;
                  #      only horizontal writing is handled.

                  objs = []
                  s = state.last
                  font = s[:font]
                  font_size = s[:font_size]
                  h_scale_mod = s[:h_scale] / 100.0

                  # Shifts the text matrix horizontally by tx text-space units.
                  advance = lambda do |tx|
                    render_state[:tm] = Matrix[ [1, 0, 0], [0, 1, 0], [tx, 0, 1] ] * render_state[:tm]
                  end

                  # tj applies only to the first char of the Tj op, so it is
                  # consumed here rather than inside the loop.
                  advance.call((0 - (tj / 1000.0)) * font_size * h_scale_mod)

                  text.each_char do |c|
                    trm = Matrix[ [font_size * h_scale_mod, 0, 0],
                                  [0, font_size, 0],
                                  [0, s[:rise], 1] ]
                    trm = trm * render_state[:tm] * s[:ctm]

                    descent = glyph_descent(c, state)
                    width = glyph_width(c, state)
                    # descent + (ascent - descent): the top edge sits at the ascent.
                    top = descent + glyph_height(c, state)

                    bl_pos = Matrix.rows([ [0, descent, 1] ]) * trm
                    tr_pos = Matrix.rows([ [width, top, 1] ]) * trm

                    px = bl_pos[0, 0]
                    py = bl_pos[0, 1]
                    media_box = page[:MediaBox]

                    objs << {
                      :x => px,
                      :y => py,
                      :width => tr_pos[0, 0] - px,
                      :height => tr_pos[0, 1] - py,
                      :line_height => tr_pos[0, 1] - py,
                      :content => font.nil? ? c : font.to_utf8(c),
                      :page => page_number,
                      :font => font.nil? ? nil : font.basefont,
                      :page_width => media_box[2] - media_box[0],
                      :page_height => media_box[3] - media_box[1]
                    }

                    # Character spacing follows every glyph; word spacing is added
                    # on top of it for the space character.
                    spacing = s[:char_spacing]
                    spacing += s[:word_spacing] if c == ' '

                    disp_x = glyph_displacement(c, state).first
                    advance.call((disp_x * font_size + spacing) * h_scale_mod)
                  end

                  objs
                end
         | 
| 116 | 
            +
             | 
| 117 | 
            +
                # Registers the :characters spatial on +pdf+.
                #
                # Inside the pdf.spatials block a handler is registered with
                # parser.for for each content stream callback of interest. The
                # handlers maintain a stack of graphics states (+state+), the
                # current text matrix and text line matrix (+render_state+) and the
                # fonts reported through :resource_font. Handlers for the text
                # showing operators return the character objects built by
                # make_text_runs; all other handlers return nil.
                #
                # pdf - object responding to spatials(name) and yielding a parser
                #       that responds to for(callback_name, &handler).
                def self.include_in(pdf)

                  pdf.spatials :characters do |parser|
                    state = []
                    page = nil
                    fonts = {}
                    font_metrics = {}
                    page_n = 0
                    render_state = {
                      :tm => Matrix.identity(3),
                      :tlm => Matrix.identity(3)
                    }

                    # Translates the text line matrix by (dx, dy) and makes the
                    # result both the text matrix and the text line matrix.
                    start_line_at = lambda do |dx, dy|
                      render_state[:tm] = Matrix[
                        [1, 0, 0], [0, 1, 0], [dx, dy, 1]
                      ] * render_state[:tlm]
                      render_state[:tlm] = render_state[:tm]
                      nil
                    end

                    # Moves down one line using the current leading.
                    next_line = lambda do
                      start_line_at.call(0, -state.last[:leading])
                    end

                    parser.for :resource_font do |data|
                      fonts[data[0]] = data[1]
                      font_metrics[data[0]] = FontMetrics.new data[1]
                      nil
                    end

                    parser.for :begin_page do |data|
                      page = data[0]
                      page_n = page_n.next
                      state << {
                        :h_scale => 100,
                        :char_spacing => 0,
                        :word_spacing => 0,
                        :leading => 0,
                        :rise => 0,
                        :font => nil,
                        :font_metrics => nil,
                        :font_size => 0,
                        :ctm => Matrix.identity(3)
                      }
                      nil
                    end

                    parser.for :end_page do |data|
                      state.pop
                      nil
                    end

                    parser.for :begin_text_object do |data|
                      render_state = {
                        :tm => Matrix.identity(3),
                        :tlm => Matrix.identity(3)
                      }
                      nil
                    end

                    # Graphics state operators.

                    parser.for :set_graphics_state_parameters do |data|
                      # TODO Handle gs graphics state dictionary set operation for
                      # :Font dictionary entries. Probably why font is sometimes nil.
                      nil
                    end

                    parser.for :save_graphics_state do |data|
                      state.push state.last.dup
                      nil
                    end

                    parser.for :restore_graphics_state do |data|
                      state.pop
                      nil
                    end

                    parser.for :concatenate_matrix do |data|
                      a, b, c, d, e, f = data
                      ctm = state.last[:ctm]
                      state.last[:ctm] = Matrix[ [a, b, 0], [c, d, 0], [e, f, 1] ] * ctm
                      nil
                    end

                    # State change operators. Each one stores its single operand
                    # under the matching key of the current graphics state.

                    {
                      :set_text_leading => :leading,
                      :set_text_rise => :rise,
                      :set_character_spacing => :char_spacing,
                      :set_word_spacing => :word_spacing,
                      :set_horizontal_text_scaling => :h_scale
                    }.each do |callback, key|
                      parser.for callback do |data|
                        state.last[key] = data.first
                        nil
                      end
                    end

                    # Position change operators.

                    parser.for :move_text_position do |data|
                      start_line_at.call(data[0], data[1])
                    end

                    parser.for :move_text_position_and_set_leading do |data|
                      state.last[:leading] = -data[1]
                      start_line_at.call(data[0], data[1])
                    end

                    # Font change operators.

                    parser.for :set_text_font_and_size do |data|
                      state.last[:font] = fonts[data[0]]
                      state.last[:font_metrics] = font_metrics[data[0]]
                      state.last[:font_size] = data[1]
                      nil
                    end

                    # Text matrix change operators.

                    parser.for :set_text_matrix_and_text_line_matrix do |data|
                      # --     --
                      # | a b 0 |
                      # | c d 0 |
                      # | e f 1 |
                      # --     --
                      a, b, c, d, e, f = data
                      render_state[:tm] = Matrix[ [a, b, 0], [c, d, 0], [e, f, 1] ]
                      render_state[:tlm] = Matrix[ [a, b, 0], [c, d, 0], [e, f, 1] ]
                      nil
                    end

                    # New line operators.

                    parser.for :move_to_start_of_next_line do |data|
                      next_line.call
                    end

                    # Show text operators.

                    parser.for :set_spacing_next_line_show_text_raw do |data|
                      state.last[:word_spacing] = data[0]
                      state.last[:char_spacing] = data[1]
                      next_line.call

                      make_text_runs data[2], 0, state, render_state, page, page_n
                    end

                    parser.for :move_to_next_line_and_show_text_raw do |data|
                      next_line.call

                      make_text_runs data.first, 0, state, render_state, page, page_n
                    end

                    parser.for :show_text_raw do |data|
                      make_text_runs data.first, 0, state, render_state, page, page_n
                    end

                    parser.for :show_text_with_positioning_raw do |data|
                      runs = []
                      tj = 0

                      data.first.each do |item|
                        # Match on the class itself rather than on its name:
                        # integers report "Integer" (not "Fixnum") on Ruby 2.4 and
                        # later, which would drop every integer adjustment.
                        case item
                        when Numeric
                          tj = item
                        when String
                          runs << make_text_runs(item, tj, state, render_state, page, page_n)
                          tj = 0
                        end
                      end

                      runs.flatten
                    end

                  end
                end
         | 
| 318 | 
            +
             | 
| 319 | 
            +
              end
         | 
| 320 | 
            +
            end
         | 
    
        data/lib/model/chunks.rb
    ADDED
    
    | @@ -0,0 +1,103 @@ | |
| 1 | 
            +
            require_relative '../spatial'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module PdfExtract
         | 
| 4 | 
            +
              module Chunks
         | 
| 5 | 
            +
             | 
| 6 | 
            +
                # TODO Look for obj[:writing_mode] == :vertical or :horizontal
         | 
| 7 | 
            +
             | 
| 8 | 
            +
                Settings.default :char_slop, 0.2
         | 
| 9 | 
            +
                Settings.default :word_slop, 4.0
         | 
| 10 | 
            +
                Settings.default :overlap_slop, 0.9
         | 
| 11 | 
            +
             | 
| 12 | 
            +
                # Registers the :chunks spatial on +pdf+, built from :characters.
                #
                # Characters are bucketed into rows keyed by their exact :y value
                # and kept ordered by :x. After all characters have been seen, each
                # row is walked left to right and neighbours are merged with
                # Spatial.merge:
                #
                # * directly, when the gap is within char_width * :char_slop;
                # * with a ' ' separator, when it is within char_width * :word_slop;
                # * otherwise the left chunk is finished and a new one starts.
                #
                # A second pass orders all chunks by :x and merges neighbours whose
                # vertical overlap ratio reaches :overlap_slop.
                #
                # The slop values are read from pdf.settings when the after hook
                # runs. The after hook returns the array of merged chunks, which
                # is empty when no characters were seen.
                def self.include_in(pdf)

                  pdf.spatials :chunks, :paged => true, :depends_on => [:characters] do |parser|
                    rows = {}

                    parser.before do
                      rows = {}
                    end

                    parser.objects :characters do |chars|
                      row = (rows[chars[:y]] ||= [])

                      # Insert in front of the first object at or right of this
                      # one, so each row stays ordered by :x.
                      idx = row.index { |obj| chars[:x] <= obj[:x] }
                      row.insert(idx || row.length, chars.dup)
                    end

                    parser.after do
                      char_slop = pdf.settings[:char_slop]
                      word_slop = pdf.settings[:word_slop]
                      overlap_slop = pdf.settings[:overlap_slop]

                      text_chunks = []

                      rows.each_pair do |y, row|
                        left = row.first
                        char_width = left[:width]

                        row.drop(1).each do |right|
                          if (left[:x] + left[:width] + (char_width * char_slop)) >= right[:x]
                            # join as adjacent chars
                            left = Spatial.merge left, right
                            char_width = right[:width] unless right[:content].strip =~ /[^A-Za-z0-9]/
                          elsif (left[:x] + left[:width] + (char_width * word_slop)) >= right[:x]
                            # join with a ' ' in the middle.
                            left = Spatial.merge left, right, :separator => ' '
                            char_width = right[:width] unless right[:content].strip =~ /[^A-Za-z0-9]/
                          else
                            # leave 'em be.
                            text_chunks << left
                            left = right
                            char_width = right[:width]
                          end
                        end

                        text_chunks << left
                      end

                      # Merge chunks that have slightly different :y positions but which
                      # mostly overlap in the y dimension.

                      merged_text_chunks = []
                      left = nil

                      text_chunks.sort_by { |obj| obj[:x] }.each do |right|
                        if left.nil?
                          left = right
                          next
                        end

                        min_height = [left[:height], right[:height]].min

                        # A zero height gives no meaningful ratio (and would raise
                        # ZeroDivisionError for integers), so treat it as no overlap.
                        joinable = !min_height.zero? &&
                          (min_height - (left[:y] - right[:y]).abs) / min_height >= overlap_slop

                        if joinable
                          # TODO follow char / word slop rules.
                          left = Spatial.merge left, right
                        else
                          merged_text_chunks << left
                          left = right
                        end
                      end

                      # Nothing to append for a page without characters.
                      merged_text_chunks << left unless left.nil?
                      merged_text_chunks
                    end
                  end
                end
         | 
| 100 | 
            +
             | 
| 101 | 
            +
              end
         | 
| 102 | 
            +
            end
         | 
| 103 | 
            +
             | 
| @@ -0,0 +1,112 @@ | |
| 1 | 
            +
            require_relative '../spatial'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module PdfExtract
         | 
| 4 | 
            +
              module Regions
         | 
| 5 | 
            +
             | 
| 6 | 
            +
                Settings.default :line_slop, 1.0
         | 
| 7 | 
            +
             | 
| 8 | 
            +
                # TODO Handle :writing_mode once present in characters and text_chunks.
         | 
| 9 | 
            +
             | 
| 10 | 
            +
                # Reports whether two spatial hashes overlap horizontally.
                #
                # Builds the closed x-extents [x, x + width] of +l+ and +r+ and
                # returns true when an endpoint of either extent lies inside the
                # other extent, false otherwise. Only :x and :width are read.
                def self.incident l, r
         | 
| 11 | 
            +
                  lx1 = l[:x]
         | 
| 12 | 
            +
                  lx2 = l[:x] + l[:width]
         | 
| 13 | 
            +
                  rx1 = r[:x]
         | 
| 14 | 
            +
                  rx2 = r[:x] + r[:width]
         | 
| 15 | 
            +
             | 
| 16 | 
            +
                  lr = (lx1..lx2)
         | 
| 17 | 
            +
                  rr = (rx1..rx2)
         | 
| 18 | 
            +
             | 
| 19 | 
            +
                  # `or` binds more loosely than the paren-less method calls, so this
                  # is four separate include? tests chained together.
                  lr.include? rx1 or lr.include? rx2 or rr.include? lx1 or rr.include? lx2
         | 
| 20 | 
            +
                end
         | 
| 21 | 
            +
             | 
| 22 | 
            +
                # Annotates every line of +region+ with its position relative to the
                # region's own origin.
                #
                # Sets line[:x_offset] and line[:y_offset] to the line's :x / :y minus
                # the region's :x / :y. region[:lines] is initialised to [] when absent.
                def self.append_line_offsets region
         | 
| 23 | 
            +
                  region[:lines] ||= []
         | 
| 24 | 
            +
                  region[:lines].each do |line|
         | 
| 25 | 
            +
                    line[:x_offset] = line[:x] - region[:x]
         | 
| 26 | 
            +
                    line[:y_offset] = line[:y] - region[:y]
         | 
| 27 | 
            +
                  end
         | 
| 28 | 
            +
                end
         | 
| 29 | 
            +
             | 
| 30 | 
            +
                # Annotates every line of +region+ with a :spacing value.
                #
                # Walks region[:lines] in their stored order and reads each line's
                # :y_offset and :height, so :y_offset must already be present on every
                # line. region[:lines] is initialised to [] when absent.
                def self.append_line_spacing region
         | 
| 31 | 
            +
                  region[:lines] ||= []
         | 
| 32 | 
            +
                  # Running total: from_top plus height of the previously visited line.
                  height_taken = 0
         | 
| 33 | 
            +
                  region[:lines].each do |line|
         | 
| 34 | 
            +
                    # Region height minus the line's upper edge (offset + height).
                    # NOTE(review): reads as "distance from the region's top" only if
                    # y grows upward - confirm against the coordinate system in use.
                    from_top = region[:height] - (line[:y_offset] + line[:height])
         | 
| 35 | 
            +
                    line[:spacing] = from_top - height_taken
         | 
| 36 | 
            +
                    height_taken = from_top + line[:height]
         | 
| 37 | 
            +
                  end
         | 
| 38 | 
            +
                end
         | 
| 39 | 
            +
                
         | 
| 40 | 
            +
                # Registers the :regions spatial type on +pdf+.
                #
                # Regions are built from :chunks objects: chunks are collected in
                # ascending :y order, each is turned into a one-line region, and
                # regions that are close enough vertically and overlap horizontally
                # are merged. Each resulting region's lines receive :x_offset,
                # :y_offset and :spacing before being passed through
                # Spatial.drop_spatial.
                #
                # pdf - object responding to #spatials and #settings; the
                #       :line_slop setting scales the vertical merge tolerance.
                def self.include_in pdf
                  pdf.spatials :regions, :paged => true, :depends_on => [:chunks] do |parser|
                    # Declared here so that the three callbacks below close over the
                    # same variables.
                    chunks = []
                    regions = []

                    # NOTE(review): presumably invoked once per page because of
                    # :paged => true - confirm in the parser implementation.
                    parser.before do
                      chunks = []
                      regions = []
                    end

                    parser.objects :chunks do |chunk|
                      # Insert a copy in front of the first stored chunk whose :y is
                      # not smaller, which keeps the list ordered by ascending :y.
                      idx = chunks.index { |obj| chunk[:y] <= obj[:y] }
                      if idx.nil?
                        chunks << chunk.dup
                      else
                        chunks.insert idx, chunk.dup
                      end
                    end

                    # TODO Rewrite to use Spatial::collapse so that text is in proper
                    # order.

                    parser.after do
                      # Convert chunks to have line content.
                      chunks.each do |chunk|
                        chunk[:lines] = [Spatial.as_line(chunk)]
                        chunk.delete :content
                      end

                      # Grow the first chunk (b) by merging any later chunk (t) that
                      # is within reach; once nothing else can be merged, b is a
                      # finished region and is moved to +regions+.
                      compare_index = 1
                      while chunks.count > compare_index
                        b = chunks.first
                        t = chunks[compare_index]

                        line_height = b[:line_height]
                        # Tolerance: the smaller of b's line height and t's height,
                        # scaled by the :line_slop setting.
                        line_slop = [line_height, t[:height]].min * pdf.settings[:line_slop]
                        incident_y = (b[:y] + b[:height] + line_slop) >= t[:y]

                        if incident_y && incident(t, b)
                          chunks[0] = Spatial.merge t, b, :lines => true
                          chunks.delete_at compare_index
                          # The merged chunk has new bounds, so rescan from the start.
                          compare_index = 1
                        elsif compare_index < chunks.count - 1
                          # Could be more chunks within range.
                          compare_index = compare_index.next
                        else
                          # Finished region.
                          regions << chunks.first
                          chunks.delete_at 0
                          compare_index = 1
                        end
                      end

                      # At most one chunk is left when the loop ends.
                      regions << chunks.first unless chunks.first.nil?

                      regions.each do |region|
                        append_line_offsets region
                        append_line_spacing region

                        region[:lines].map! do |line|
                          Spatial.drop_spatial line
                        end
                      end

                      # Non-mutating on purpose: the sorted copy (descending :y) is
                      # the value of this block.
                      regions.sort_by { |obj| -obj[:y] }
                    end
                  end
                end
         | 
| 110 | 
            +
                
         | 
| 111 | 
            +
              end
         | 
| 112 | 
            +
            end
         | 
    
        data/lib/multi_range.rb
    ADDED
    
    | @@ -0,0 +1,69 @@ | |
| 1 | 
            +
             | 
| 2 | 
            +
            module PdfExtract
         | 
| 3 | 
            +
              # A collection of closed ranges in which overlapping ranges are merged
              # as they are appended, so the stored ranges never overlap each other.
              #
              # All query methods are safe to call before anything has been
              # appended: the extrema return nil and #avg / #covered return zero.
              class MultiRange

                # The stored, mutually non-overlapping ranges (in no particular order).
                attr_accessor :ranges

                def initialize
                  @ranges = []
                end

                # Adds +range+, merging it with every stored range it touches.
                #
                # Ranges whose #max or #min is nil (for example an empty range such
                # as 5..1) are ignored. Always returns nil.
                def append range
                  upper = range.max
                  lower = range.min
                  return if upper.nil? || lower.nil?

                  incident = @ranges.select do |r|
                    r.include?(lower) || r.include?(upper) ||
                      range.include?(r.min) || range.include?(r.max)
                  end

                  incident << range

                  # Replace everything the new range touches by one spanning range.
                  merged = (incident.map(&:min).min..incident.map(&:max).max)
                  @ranges = (@ranges - incident) << merged

                  # Invalidate memoised results.
                  @max_excluded = @min_excluded = @max = @min = nil
                end

                # With a single range, its upper bound; with several, the lower
                # bound of the range that starts highest. nil when empty.
                def max_excluded
                  return nil if @ranges.empty?

                  @max_excluded ||= (count == 1 ? @ranges.first.max : @ranges.map(&:min).max)
                end

                # With a single range, its lower bound; with several, the upper
                # bound of the range that ends lowest. nil when empty.
                def min_excluded
                  return nil if @ranges.empty?

                  @min_excluded ||= (count == 1 ? @ranges.first.min : @ranges.map(&:max).min)
                end

                # Largest upper bound over all ranges, or nil when empty.
                def max
                  @max ||= @ranges.map(&:max).max
                end

                # Smallest lower bound over all ranges, or nil when empty.
                def min
                  @min ||= @ranges.map(&:min).min
                end

                # Mean length (max - min) of the stored ranges as a Float; 0.0 when
                # there are no ranges, rather than the NaN that 0 / 0.0 would give.
                def avg
                  return 0.0 if @ranges.empty?

                  covered / @ranges.count.to_f
                end

                # Sum of the lengths (max - min) of the stored ranges.
                def covered
                  @ranges.reduce(0) { |total, r| total + (r.max - r.min) }
                end

                # Number of stored ranges.
                def count
                  @ranges.count
                end

              end
         | 
| 69 | 
            +
            end
         | 
    
        data/lib/names.rb
    ADDED
    
    | @@ -0,0 +1,85 @@ | |
| 1 | 
            +
            require "net/http"
         | 
| 2 | 
            +
            require "json"
         | 
| 3 | 
            +
            require "sqlite3"
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            require_relative "pdf-extract"
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            module PdfExtract::Names
         | 
| 8 | 
            +
             | 
| 9 | 
            +
              # Name detector backed by the bundled SQLite family-names database.
              #
              # The database handle and the stop-word list are created once, when
              # this class body is evaluated.
              class NamesDatabase
                # Weight added per matching row whose third column is 1, and per any
                # other matching row, respectively.
                @@ambiguous_weighting = 0.1
                @@unambiguous_weighting = 1.0

                # Returns the path of +data_filename+ inside the "../data/" directory
                # relative to the directory containing this file.
                def self.path_to_data data_filename
                  File.join(File.dirname(File.expand_path(__FILE__)), "../data/" + data_filename)
                end

                @@db = SQLite3::Database.new(path_to_data("familynames.db"), {:readonly => true})
                # File.read closes the file for us. Entries are stripped so that a
                # trailing newline or a space after a comma cannot stop a stop word
                # from matching.
                @@stop_words = File.read(path_to_data("stopwords.txt")).split(",").map(&:strip)

                # Scores how name-like +content+ is.
                #
                # content - String of whitespace-separated words.
                #
                # Returns {:name_frequency => score}, where score is the summed
                # weight of all database matches divided by the total number of
                # words, or 0 when nothing matched.
                def self.detect_names content
                  words = content.split
                  sum = 0.0

                  words.each do |word|
                    word = word.downcase

                    # The parentheses are essential: written as
                    # `include? word && word.length > 1`, the whole boolean
                    # expression becomes the argument and no word is ever skipped.
                    next if @@stop_words.include?(word) || word.length <= 1

                    # Names are looked up capitalised, including the letter that
                    # follows each hyphen (e.g. "smith-jones" -> "Smith-Jones").
                    query_word = word.capitalize.gsub(/-(.)/) { |s|
                      "-" + s[1].capitalize
                    }

                    @@db.execute("select * from names where name = ?", query_word) do |row|
                      if row[2] == 1
                        sum += @@ambiguous_weighting
                      else
                        sum += @@unambiguous_weighting
                      end
                    end
                  end

                  if sum == 0
                    {:name_frequency => 0}
                  else
                    {:name_frequency => (sum / words.length.to_f)}
                  end
                end
              end
         | 
| 50 | 
            +
                  
         | 
| 51 | 
            +
              # Name detector that delegates to a remote HTTP service.
              class NamesService
         | 
| 52 | 
            +
                # POSTs +content+ to "/detect" on names.crrd.dyndns.org and returns the
                # parsed JSON body of a "200" response. In every other case (non-200
                # response, or any StandardError raised along the way) the default
                # {:name_frequency => 0.0} is returned.
                #
                # NOTE(review): JSON.parse yields String keys by default, so a
                # successful result is presumably keyed differently from the
                # Symbol-keyed default - confirm what callers expect.
                def self.detect_names content
         | 
| 53 | 
            +
                  data = {:name_frequency => 0.0}
         | 
| 54 | 
            +
                  begin
         | 
| 55 | 
            +
                    response = Net::HTTP.start "names.crrd.dyndns.org" do |http|
         | 
| 56 | 
            +
                      http.post "/detect", content
         | 
| 57 | 
            +
                    end
         | 
| 58 | 
            +
                  
         | 
| 59 | 
            +
                    if response.code == "200"
         | 
| 60 | 
            +
                      data = JSON.parse response.body
         | 
| 61 | 
            +
                    end
         | 
| 62 | 
            +
                  # Best effort: a bare rescue catches StandardError and the empty
                  # body discards it, leaving +data+ at its last assigned value.
                  rescue
         | 
| 63 | 
            +
                  end
         | 
| 64 | 
            +
                  data
         | 
| 65 | 
            +
                end
         | 
| 66 | 
            +
              end
         | 
| 67 | 
            +
             | 
| 68 | 
            +
              # Detector that performs no detection at all.
              class NoDetection
         | 
| 69 | 
            +
                # Ignores +content+ and always returns {:name_frequency => 0.0}.
                def self.detect_names content
         | 
| 70 | 
            +
                  {:name_frequency => 0.0}
         | 
| 71 | 
            +
                end
         | 
| 72 | 
            +
              end
         | 
| 73 | 
            +
             | 
| 74 | 
            +
              # Detector used by default when the module is loaded.
              @@detector = NamesDatabase
         | 
| 75 | 
            +
             | 
| 76 | 
            +
              # Replaces the active detector. +detector_class+ is stored as given; it
              # is later sent detect_names with the content to analyse.
              def self.detector= detector_class
         | 
| 77 | 
            +
                @@detector = detector_class
         | 
| 78 | 
            +
              end
         | 
| 79 | 
            +
             | 
| 80 | 
            +
              # Forwards +content+ to the currently stored detector's detect_names
              # and returns whatever that call returns.
              def self.detect_names content
         | 
| 81 | 
            +
                @@detector.detect_names content
         | 
| 82 | 
            +
              end
         | 
| 83 | 
            +
              
         | 
| 84 | 
            +
            end
         | 
| 85 | 
            +
             |