pdf-extract 0.0.10 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/8630-31489-1-PB.mask.pdf +0 -0
 - data/bin/pdf-extract +1 -2
 - data/bin/test2.mask.pdf +0 -0
 - data/bin/test3.mask.pdf +0 -0
 - data/bin/test4.mask.pdf +0 -0
 - data/bin/test5.mask.pdf +0 -0
 - data/bin/test6.mask.pdf +0 -0
 - data/bin/tmp.txt +368 -0
 - data/lib/analysis/columns.rb +9 -5
 - data/lib/analysis/sections.rb +50 -32
 - data/lib/font_metrics.rb +11 -3
 - data/lib/language.rb +9 -9
 - data/lib/model/chunks.rb +8 -4
 - data/lib/model/regions.rb +7 -7
 - data/lib/multi_range.rb +13 -3
 - data/lib/pdf-extract.rb +0 -2
 - data/lib/references/references.rb +16 -15
 - data/lib/references/resolve.rb +15 -15
 - data/lib/references/score.rb +1 -1
 - data/lib/spatial.rb +13 -13
 - metadata +77 -134
 - data/lib/view/png_view.rb +0 -30
 
    
        data/lib/font_metrics.rb
    CHANGED
    
    | 
         @@ -29,7 +29,7 @@ module PdfExtract 
     | 
|
| 
       29 
29 
     | 
    
         
             
                  @ascent = 0
         
     | 
| 
       30 
30 
     | 
    
         
             
                  @descent = 0
         
     | 
| 
       31 
31 
     | 
    
         
             
                  @bbox = [0, 0, 0, 0]
         
     | 
| 
       32 
     | 
    
         
            -
             
     | 
| 
      
 32 
     | 
    
         
            +
             
     | 
| 
       33 
33 
     | 
    
         
             
                  base_font = font.basefont.to_s
         
     | 
| 
       34 
34 
     | 
    
         
             
                  if @@base_fonts.key? base_font
         
     | 
| 
       35 
35 
     | 
    
         
             
                    @ascent = @@base_fonts[base_font][:Ascent]
         
     | 
| 
         @@ -37,12 +37,20 @@ module PdfExtract 
     | 
|
| 
       37 
37 
     | 
    
         
             
                    @bbox = @@base_fonts[base_font][:FontBBox]
         
     | 
| 
       38 
38 
     | 
    
         
             
                    @glyph_width_lookup = proc { |c|
         
     | 
| 
       39 
39 
     | 
    
         
             
                      @@base_fonts[base_font][:Widths].fetch(c.codepoints.first, 0)
         
     | 
| 
       40 
     | 
    
         
            -
                    } 
     | 
| 
      
 40 
     | 
    
         
            +
                    }
         
     | 
| 
       41 
41 
     | 
    
         
             
                  else
         
     | 
| 
       42 
42 
     | 
    
         
             
                    @ascent = font.ascent
         
     | 
| 
       43 
43 
     | 
    
         
             
                    @descent = font.descent
         
     | 
| 
       44 
44 
     | 
    
         
             
                    @bbox = font.bbox
         
     | 
| 
       45 
     | 
    
         
            -
                    @glyph_width_lookup = proc  
     | 
| 
      
 45 
     | 
    
         
            +
                    @glyph_width_lookup = proc do |c|
         
     | 
| 
      
 46 
     | 
    
         
            +
                      begin
         
     | 
| 
      
 47 
     | 
    
         
            +
                        font.glyph_width c.codepoints.first
         
     | 
| 
      
 48 
     | 
    
         
            +
                      rescue TypeError => e
         
     | 
| 
      
 49 
     | 
    
         
            +
                        # It seems some fonts don't have a first char attribute in their
         
     | 
| 
      
 50 
     | 
    
         
            +
                        # descriptor and this causes problems for pdf-reader.
         
     | 
| 
      
 51 
     | 
    
         
            +
                        0
         
     | 
| 
      
 52 
     | 
    
         
            +
                      end
         
     | 
| 
      
 53 
     | 
    
         
            +
                    end
         
     | 
| 
       46 
54 
     | 
    
         
             
                  end
         
     | 
| 
       47 
55 
     | 
    
         | 
| 
       48 
56 
     | 
    
         
             
                  if not @bbox.nil?
         
     | 
    
        data/lib/language.rb
    CHANGED
    
    | 
         @@ -17,7 +17,7 @@ module PdfExtract::Language 
     | 
|
| 
       17 
17 
     | 
    
         
             
                  when "\ufb05" then r << "ft"
         
     | 
| 
       18 
18 
     | 
    
         
             
                  when "\ufb06" then r << "st"
         
     | 
| 
       19 
19 
     | 
    
         
             
                  when "\u1d6b" then r << "ue"
         
     | 
| 
       20 
     | 
    
         
            -
             
     | 
| 
      
 20 
     | 
    
         
            +
             
     | 
| 
       21 
21 
     | 
    
         
             
                  # Normalise some punctuation.
         
     | 
| 
       22 
22 
     | 
    
         
             
                  when "\u2018" then r << "'"
         
     | 
| 
       23 
23 
     | 
    
         
             
                  when "\u2019" then r << "'"
         
     | 
| 
         @@ -33,19 +33,19 @@ module PdfExtract::Language 
     | 
|
| 
       33 
33 
     | 
    
         
             
                    r << c
         
     | 
| 
       34 
34 
     | 
    
         
             
                  end
         
     | 
| 
       35 
35 
     | 
    
         
             
                end
         
     | 
| 
       36 
     | 
    
         
            -
             
     | 
| 
      
 36 
     | 
    
         
            +
             
     | 
| 
       37 
37 
     | 
    
         
             
                r.gsub /\s+/, " "
         
     | 
| 
       38 
38 
     | 
    
         
             
              end
         
     | 
| 
       39 
     | 
    
         
            -
             
     | 
| 
      
 39 
     | 
    
         
            +
             
     | 
| 
       40 
40 
     | 
    
         
             
              def self.letter_ratio s
         
     | 
| 
       41 
41 
     | 
    
         
             
                s.count("A-Z0-9\-[],.\"'()") / s.length.to_f
         
     | 
| 
       42 
42 
     | 
    
         
             
              end
         
     | 
| 
       43 
     | 
    
         
            -
             
     | 
| 
      
 43 
     | 
    
         
            +
             
     | 
| 
       44 
44 
     | 
    
         
             
              # TODO Ignore caps in middle of words
         
     | 
| 
       45 
45 
     | 
    
         
             
              def self.cap_ratio s
         
     | 
| 
       46 
46 
     | 
    
         
             
                sentence_end = true
         
     | 
| 
       47 
47 
     | 
    
         
             
                cap_count = 0
         
     | 
| 
       48 
     | 
    
         
            -
             
     | 
| 
      
 48 
     | 
    
         
            +
             
     | 
| 
       49 
49 
     | 
    
         
             
                s.each_char do |c|
         
     | 
| 
       50 
50 
     | 
    
         
             
                  if c =~ /\./
         
     | 
| 
       51 
51 
     | 
    
         
             
                    sentence_end = true
         
     | 
| 
         @@ -56,13 +56,13 @@ module PdfExtract::Language 
     | 
|
| 
       56 
56 
     | 
    
         
             
                    sentence_end = false
         
     | 
| 
       57 
57 
     | 
    
         
             
                  end
         
     | 
| 
       58 
58 
     | 
    
         
             
                end
         
     | 
| 
       59 
     | 
    
         
            -
             
     | 
| 
      
 59 
     | 
    
         
            +
             
     | 
| 
       60 
60 
     | 
    
         
             
                cap_count / s.split.length.to_f
         
     | 
| 
       61 
61 
     | 
    
         
             
              end
         
     | 
| 
       62 
     | 
    
         
            -
             
     | 
| 
      
 62 
     | 
    
         
            +
             
     | 
| 
       63 
63 
     | 
    
         
             
              def self.year_ratio s
         
     | 
| 
       64 
64 
     | 
    
         
             
                words = s.split
         
     | 
| 
       65 
     | 
    
         
            -
             
     | 
| 
      
 65 
     | 
    
         
            +
             
     | 
| 
       66 
66 
     | 
    
         
             
                year_words = words.map do |word|
         
     | 
| 
       67 
67 
     | 
    
         
             
                  word =~ /[^\d]\d{4}[^\d]/
         
     | 
| 
       68 
68 
     | 
    
         
             
                end
         
     | 
| 
         @@ -77,6 +77,6 @@ module PdfExtract::Language 
     | 
|
| 
       77 
77 
     | 
    
         
             
              def self.word_count s
         
     | 
| 
       78 
78 
     | 
    
         
             
                s.split.count
         
     | 
| 
       79 
79 
     | 
    
         
             
              end
         
     | 
| 
       80 
     | 
    
         
            -
             
     | 
| 
      
 80 
     | 
    
         
            +
             
     | 
| 
       81 
81 
     | 
    
         
             
            end
         
     | 
| 
       82 
82 
     | 
    
         | 
    
        data/lib/model/chunks.rb
    CHANGED
    
    | 
         @@ -24,14 +24,14 @@ module PdfExtract 
     | 
|
| 
       24 
24 
     | 
    
         
             
                }
         
     | 
| 
       25 
25 
     | 
    
         | 
| 
       26 
26 
     | 
    
         
             
                def self.include_in pdf
         
     | 
| 
       27 
     | 
    
         
            -
             
     | 
| 
      
 27 
     | 
    
         
            +
             
     | 
| 
       28 
28 
     | 
    
         
             
                  pdf.spatials :chunks, :paged => true, :depends_on => [:characters] do |parser|
         
     | 
| 
       29 
29 
     | 
    
         
             
                    rows = {}
         
     | 
| 
       30 
30 
     | 
    
         | 
| 
       31 
31 
     | 
    
         
             
                    parser.before do
         
     | 
| 
       32 
32 
     | 
    
         
             
                      rows = {}
         
     | 
| 
       33 
33 
     | 
    
         
             
                    end
         
     | 
| 
       34 
     | 
    
         
            -
             
     | 
| 
      
 34 
     | 
    
         
            +
             
     | 
| 
       35 
35 
     | 
    
         
             
                    parser.objects :characters do |chars|
         
     | 
| 
       36 
36 
     | 
    
         
             
                      y = chars[:y]
         
     | 
| 
       37 
37 
     | 
    
         
             
                      rows[y] = [] if rows[y].nil?
         
     | 
| 
         @@ -48,7 +48,7 @@ module PdfExtract 
     | 
|
| 
       48 
48 
     | 
    
         
             
                      char_slop = pdf.settings[:char_slop]
         
     | 
| 
       49 
49 
     | 
    
         
             
                      word_slop = pdf.settings[:word_slop]
         
     | 
| 
       50 
50 
     | 
    
         
             
                      overlap_slop = pdf.settings[:overlap_slop]
         
     | 
| 
       51 
     | 
    
         
            -
             
     | 
| 
      
 51 
     | 
    
         
            +
             
     | 
| 
       52 
52 
     | 
    
         
             
                      text_chunks = []
         
     | 
| 
       53 
53 
     | 
    
         | 
| 
       54 
54 
     | 
    
         
             
                      rows.each_pair do |y, row|
         
     | 
| 
         @@ -105,7 +105,11 @@ module PdfExtract 
     | 
|
| 
       105 
105 
     | 
    
         
             
                      end
         
     | 
| 
       106 
106 
     | 
    
         | 
| 
       107 
107 
     | 
    
         
             
                      merged_text_chunks << text_chunks.first
         
     | 
| 
       108 
     | 
    
         
            -
             
     | 
| 
      
 108 
     | 
    
         
            +
             
     | 
| 
      
 109 
     | 
    
         
            +
                      # Remove empty lines - they mess up region detection by
         
     | 
| 
      
 110 
     | 
    
         
            +
                      # making them join together.
         
     | 
| 
      
 111 
     | 
    
         
            +
                      merged_text_chunks.reject { |chunk| chunk[:content].strip == "" }
         
     | 
| 
      
 112 
     | 
    
         
            +
                    end
         
     | 
| 
       109 
113 
     | 
    
         
             
                  end
         
     | 
| 
       110 
114 
     | 
    
         
             
                end
         
     | 
| 
       111 
115 
     | 
    
         | 
    
        data/lib/model/regions.rb
    CHANGED
    
    | 
         @@ -41,7 +41,7 @@ to be part of the same region. :line_slop is multiplied by the average line heig 
     | 
|
| 
       41 
41 
     | 
    
         
             
                    height_taken = from_top + line[:height]
         
     | 
| 
       42 
42 
     | 
    
         
             
                  end
         
     | 
| 
       43 
43 
     | 
    
         
             
                end
         
     | 
| 
       44 
     | 
    
         
            -
             
     | 
| 
      
 44 
     | 
    
         
            +
             
     | 
| 
       45 
45 
     | 
    
         
             
                def self.include_in pdf
         
     | 
| 
       46 
46 
     | 
    
         
             
                  pdf.spatials :regions, :paged => true, :depends_on => [:chunks] do |parser|
         
     | 
| 
       47 
47 
     | 
    
         
             
                    chunks = []
         
     | 
| 
         @@ -51,7 +51,7 @@ to be part of the same region. :line_slop is multiplied by the average line heig 
     | 
|
| 
       51 
51 
     | 
    
         
             
                      chunks = []
         
     | 
| 
       52 
52 
     | 
    
         
             
                      regions = []
         
     | 
| 
       53 
53 
     | 
    
         
             
                    end
         
     | 
| 
       54 
     | 
    
         
            -
             
     | 
| 
      
 54 
     | 
    
         
            +
             
     | 
| 
       55 
55 
     | 
    
         
             
                    parser.objects :chunks do |chunk|
         
     | 
| 
       56 
56 
     | 
    
         
             
                      y = chunk[:y].floor
         
     | 
| 
       57 
57 
     | 
    
         | 
| 
         @@ -72,16 +72,16 @@ to be part of the same region. :line_slop is multiplied by the average line heig 
     | 
|
| 
       72 
72 
     | 
    
         
             
                        chunk[:lines] = [Spatial.as_line(chunk)]
         
     | 
| 
       73 
73 
     | 
    
         
             
                        chunk.delete :content
         
     | 
| 
       74 
74 
     | 
    
         
             
                      end
         
     | 
| 
       75 
     | 
    
         
            -
             
     | 
| 
      
 75 
     | 
    
         
            +
             
     | 
| 
       76 
76 
     | 
    
         
             
                      compare_index = 1
         
     | 
| 
       77 
77 
     | 
    
         
             
                      while chunks.count > compare_index
         
     | 
| 
       78 
78 
     | 
    
         
             
                        b = chunks.first
         
     | 
| 
       79 
79 
     | 
    
         
             
                        t = chunks[compare_index]
         
     | 
| 
       80 
     | 
    
         
            -
             
     | 
| 
      
 80 
     | 
    
         
            +
             
     | 
| 
       81 
81 
     | 
    
         
             
                        line_height = b[:line_height]
         
     | 
| 
       82 
82 
     | 
    
         
             
                        line_slop = [line_height, t[:height]].min * pdf.settings[:line_slop]
         
     | 
| 
       83 
83 
     | 
    
         
             
                        incident_y = (b[:y] + b[:height] + line_slop) >= t[:y]
         
     | 
| 
       84 
     | 
    
         
            -
             
     | 
| 
      
 84 
     | 
    
         
            +
             
     | 
| 
       85 
85 
     | 
    
         
             
                        if incident_y && incident(t, b)
         
     | 
| 
       86 
86 
     | 
    
         
             
                          chunks[0] = Spatial.merge t, b, :lines => true
         
     | 
| 
       87 
87 
     | 
    
         
             
                          chunks.delete_at compare_index
         
     | 
| 
         @@ -96,7 +96,7 @@ to be part of the same region. :line_slop is multiplied by the average line heig 
     | 
|
| 
       96 
96 
     | 
    
         
             
                          compare_index = 1
         
     | 
| 
       97 
97 
     | 
    
         
             
                        end
         
     | 
| 
       98 
98 
     | 
    
         
             
                      end
         
     | 
| 
       99 
     | 
    
         
            -
             
     | 
| 
      
 99 
     | 
    
         
            +
             
     | 
| 
       100 
100 
     | 
    
         
             
                      regions << chunks.first unless chunks.first.nil?
         
     | 
| 
       101 
101 
     | 
    
         | 
| 
       102 
102 
     | 
    
         
             
                      regions.each do |region|
         
     | 
| 
         @@ -112,6 +112,6 @@ to be part of the same region. :line_slop is multiplied by the average line heig 
     | 
|
| 
       112 
112 
     | 
    
         
             
                    end
         
     | 
| 
       113 
113 
     | 
    
         
             
                  end
         
     | 
| 
       114 
114 
     | 
    
         
             
                end
         
     | 
| 
       115 
     | 
    
         
            -
             
     | 
| 
      
 115 
     | 
    
         
            +
             
     | 
| 
       116 
116 
     | 
    
         
             
              end
         
     | 
| 
       117 
117 
     | 
    
         
             
            end
         
     | 
    
        data/lib/multi_range.rb
    CHANGED
    
    | 
         @@ -15,7 +15,7 @@ module PdfExtract 
     | 
|
| 
       15 
15 
     | 
    
         
             
                    r.include?(range.min) || r.include?(range.max) ||
         
     | 
| 
       16 
16 
     | 
    
         
             
                      range.include?(r.min) || range.include?(r.max)
         
     | 
| 
       17 
17 
     | 
    
         
             
                  end
         
     | 
| 
       18 
     | 
    
         
            -
             
     | 
| 
      
 18 
     | 
    
         
            +
             
     | 
| 
       19 
19 
     | 
    
         
             
                  incident << range
         
     | 
| 
       20 
20 
     | 
    
         | 
| 
       21 
21 
     | 
    
         
             
                  non_incident = @ranges - incident
         
     | 
| 
         @@ -46,11 +46,21 @@ module PdfExtract 
     | 
|
| 
       46 
46 
     | 
    
         
             
                end
         
     | 
| 
       47 
47 
     | 
    
         | 
| 
       48 
48 
     | 
    
         
             
                def max
         
     | 
| 
       49 
     | 
    
         
            -
                  @ 
     | 
| 
      
 49 
     | 
    
         
            +
                  @ranges.sort_by { |r| -r.max }.first.max
         
     | 
| 
       50 
50 
     | 
    
         
             
                end
         
     | 
| 
       51 
51 
     | 
    
         | 
| 
       52 
52 
     | 
    
         
             
                def min
         
     | 
| 
       53 
     | 
    
         
            -
                  @ 
     | 
| 
      
 53 
     | 
    
         
            +
                  @ranges.sort_by { |r| r.min }.first.min
         
     | 
| 
      
 54 
     | 
    
         
            +
                end
         
     | 
| 
      
 55 
     | 
    
         
            +
             
     | 
| 
      
 56 
     | 
    
         
            +
                def widest
         
     | 
| 
      
 57 
     | 
    
         
            +
                  widest = @ranges.sort_by { |r| r.max - r.min }.last
         
     | 
| 
      
 58 
     | 
    
         
            +
                  widest.max - widest.min
         
     | 
| 
      
 59 
     | 
    
         
            +
                end
         
     | 
| 
      
 60 
     | 
    
         
            +
             
     | 
| 
      
 61 
     | 
    
         
            +
                def narrowest
         
     | 
| 
      
 62 
     | 
    
         
            +
                  narrowest = @ranges.sort_by { |r| r.max - r.min }.first
         
     | 
| 
      
 63 
     | 
    
         
            +
                  narrowest.max - narrowest.min
         
     | 
| 
       54 
64 
     | 
    
         
             
                end
         
     | 
| 
       55 
65 
     | 
    
         | 
| 
       56 
66 
     | 
    
         
             
                def avg
         
     | 
    
        data/lib/pdf-extract.rb
    CHANGED
    
    | 
         @@ -9,7 +9,6 @@ require_relative 'analysis/columns' 
     | 
|
| 
       9 
9 
     | 
    
         
             
            require_relative 'analysis/sections'
         
     | 
| 
       10 
10 
     | 
    
         
             
            require_relative 'references/references'
         
     | 
| 
       11 
11 
     | 
    
         
             
            require_relative 'references/resolved_references'
         
     | 
| 
       12 
     | 
    
         
            -
            require_relative 'view/png_view'
         
     | 
| 
       13 
12 
     | 
    
         
             
            require_relative 'view/pdf_view'
         
     | 
| 
       14 
13 
     | 
    
         
             
            require_relative 'view/xml_view'
         
     | 
| 
       15 
14 
     | 
    
         | 
| 
         @@ -68,7 +67,6 @@ module PdfExtract 
     | 
|
| 
       68 
67 
     | 
    
         
             
                add_parser ResolvedReferences
         
     | 
| 
       69 
68 
     | 
    
         | 
| 
       70 
69 
     | 
    
         
             
                add_view :pdf, PdfView
         
     | 
| 
       71 
     | 
    
         
            -
                add_view :png, PngView
         
     | 
| 
       72 
70 
     | 
    
         
             
                add_view :xml, XmlView
         
     | 
| 
       73 
71 
     | 
    
         
             
              end
         
     | 
| 
       74 
72 
     | 
    
         | 
| 
         @@ -6,7 +6,7 @@ module PdfExtract 
     | 
|
| 
       6 
6 
     | 
    
         
             
              module References
         
     | 
| 
       7 
7 
     | 
    
         | 
| 
       8 
8 
     | 
    
         
             
                Settings.declare :reference_flex, {
         
     | 
| 
       9 
     | 
    
         
            -
                  :default => 0. 
     | 
| 
      
 9 
     | 
    
         
            +
                  :default => 0.2,
         
     | 
| 
       10 
10 
     | 
    
         
             
                  :module => self.name,
         
     | 
| 
       11 
11 
     | 
    
         
             
                  :description => "Article sections are given a score as potential reference sections. Their score is based on article section features, such as the number of family names that appear, the ratio of uppercase letters to lowercase, and so on. Any article section that has a score that is more than 1 - :reference_flex percent of the best score will be parsed as a reference section."
         
     | 
| 
       12 
12 
     | 
    
         
             
                }
         
     | 
| 
         @@ -16,7 +16,7 @@ module PdfExtract 
     | 
|
| 
       16 
16 
     | 
    
         
             
                  :module => self.name,
         
     | 
| 
       17 
17 
     | 
    
         
             
                  :description => "There must be :min_sequence_count or more numbered references within a candidate reference section for them to be parsed as number-delimited references."
         
     | 
| 
       18 
18 
     | 
    
         
             
                }
         
     | 
| 
       19 
     | 
    
         
            -
             
     | 
| 
      
 19 
     | 
    
         
            +
             
     | 
| 
       20 
20 
     | 
    
         
             
                Settings.declare :max_reference_order, {
         
     | 
| 
       21 
21 
     | 
    
         
             
                  :default => 1000,
         
     | 
| 
       22 
22 
     | 
    
         
             
                  :module => self.name,
         
     | 
| 
         @@ -82,11 +82,11 @@ module PdfExtract 
     | 
|
| 
       82 
82 
     | 
    
         | 
| 
       83 
83 
     | 
    
         
             
                  # Determine the charcaters that are most likely part of numeric
         
     | 
| 
       84 
84 
     | 
    
         
             
                  # delimiters.
         
     | 
| 
       85 
     | 
    
         
            -
             
     | 
| 
      
 85 
     | 
    
         
            +
             
     | 
| 
       86 
86 
     | 
    
         
             
                  after = {}
         
     | 
| 
       87 
87 
     | 
    
         
             
                  before = {}
         
     | 
| 
       88 
88 
     | 
    
         
             
                  last_n = -1
         
     | 
| 
       89 
     | 
    
         
            -
             
     | 
| 
      
 89 
     | 
    
         
            +
             
     | 
| 
       90 
90 
     | 
    
         
             
                  s.scan /[^\d]?\d+[^\d]/ do |m|
         
     | 
| 
       91 
91 
     | 
    
         
             
                    n = m[/\d+/].to_i
         
     | 
| 
       92 
92 
     | 
    
         
             
                    if n < pdf.settings[:max_reference_order]
         
     | 
| 
         @@ -115,14 +115,14 @@ module PdfExtract 
     | 
|
| 
       115 
115 
     | 
    
         
             
                  if ["", "\\[", "\\ "].include?(b_s) && ["", "\\.", "\\]", "\\ "].include?(a_s)
         
     | 
| 
       116 
116 
     | 
    
         | 
| 
       117 
117 
     | 
    
         
             
                    # Split by the delimiters and record separate refs.
         
     | 
| 
       118 
     | 
    
         
            -
             
     | 
| 
      
 118 
     | 
    
         
            +
             
     | 
| 
       119 
119 
     | 
    
         
             
                    last_n = -1
         
     | 
| 
       120 
120 
     | 
    
         
             
                    current_ref = ""
         
     | 
| 
       121 
121 
     | 
    
         
             
                    refs = []
         
     | 
| 
       122 
122 
     | 
    
         
             
                    parts = s.partition(Regexp.new "#{b_s}?\\d+#{a_s}")
         
     | 
| 
       123 
     | 
    
         
            -
             
     | 
| 
      
 123 
     | 
    
         
            +
             
     | 
| 
       124 
124 
     | 
    
         
             
                    while not parts[1].length.zero?
         
     | 
| 
       125 
     | 
    
         
            -
                      n = parts[1][/\d+/].to_i 
     | 
| 
      
 125 
     | 
    
         
            +
                      n = parts[1][/\d+/].to_i
         
     | 
| 
       126 
126 
     | 
    
         
             
                      if n < pdf.settings[:max_reference_order] && last_n == -1
         
     | 
| 
       127 
127 
     | 
    
         
             
                        last_n = n
         
     | 
| 
       128 
128 
     | 
    
         
             
                      elsif n == last_n.next
         
     | 
| 
         @@ -139,12 +139,12 @@ module PdfExtract 
     | 
|
| 
       139 
139 
     | 
    
         | 
| 
       140 
140 
     | 
    
         
             
                      parts = parts[2].partition(Regexp.new "#{b_s}?\\d+#{a_s}")
         
     | 
| 
       141 
141 
     | 
    
         
             
                    end
         
     | 
| 
       142 
     | 
    
         
            -
             
     | 
| 
      
 142 
     | 
    
         
            +
             
     | 
| 
       143 
143 
     | 
    
         
             
                    refs << {
         
     | 
| 
       144 
144 
     | 
    
         
             
                      :content => (current_ref + parts[0]).strip,
         
     | 
| 
       145 
145 
     | 
    
         
             
                      :order => last_n
         
     | 
| 
       146 
146 
     | 
    
         
             
                    }
         
     | 
| 
       147 
     | 
    
         
            -
             
     | 
| 
      
 147 
     | 
    
         
            +
             
     | 
| 
       148 
148 
     | 
    
         
             
                    refs
         
     | 
| 
       149 
149 
     | 
    
         | 
| 
       150 
150 
     | 
    
         
             
                  else
         
     | 
| 
         @@ -177,7 +177,7 @@ module PdfExtract 
     | 
|
| 
       177 
177 
     | 
    
         | 
| 
       178 
178 
     | 
    
         
             
                  seq_count >= pdf.settings[:min_sequence_count]
         
     | 
| 
       179 
179 
     | 
    
         
             
                end
         
     | 
| 
       180 
     | 
    
         
            -
             
     | 
| 
      
 180 
     | 
    
         
            +
             
     | 
| 
       181 
181 
     | 
    
         
             
                def self.include_in pdf
         
     | 
| 
       182 
182 
     | 
    
         
             
                  pdf.spatials :references, :depends_on => [:sections] do |parser|
         
     | 
| 
       183 
183 
     | 
    
         | 
| 
         @@ -190,7 +190,7 @@ module PdfExtract 
     | 
|
| 
       190 
190 
     | 
    
         
             
                    parser.after do
         
     | 
| 
       191 
191 
     | 
    
         
             
                      max_score = sections.map {|s| s[:reference_score]}.max
         
     | 
| 
       192 
192 
     | 
    
         
             
                      min_permittable = max_score - (max_score * pdf.settings[:reference_flex])
         
     | 
| 
       193 
     | 
    
         
            -
             
     | 
| 
      
 193 
     | 
    
         
            +
             
     | 
| 
       194 
194 
     | 
    
         
             
                      refs = []
         
     | 
| 
       195 
195 
     | 
    
         | 
| 
       196 
196 
     | 
    
         
             
                      sections = sections.reject do |s|
         
     | 
| 
         @@ -199,13 +199,14 @@ module PdfExtract 
     | 
|
| 
       199 
199 
     | 
    
         
             
                        # half of an article.
         
     | 
| 
       200 
200 
     | 
    
         
             
                        s[:lateness] < pdf.settings[:min_lateness] || s[:year_ratio].zero?
         
     | 
| 
       201 
201 
     | 
    
         
             
                      end
         
     | 
| 
       202 
     | 
    
         
            -
             
     | 
| 
      
 202 
     | 
    
         
            +
             
     | 
| 
       203 
203 
     | 
    
         
             
                      sections.each do |section|
         
     | 
| 
       204 
204 
     | 
    
         
             
                        if section[:reference_score] >= min_permittable
         
     | 
| 
       205 
205 
     | 
    
         
             
                        # TODO Enable classification once we have a reasonable model.
         
     | 
| 
       206 
206 
     | 
    
         
             
                        #if Score.reference?(section)
         
     | 
| 
       207 
     | 
    
         
            -
                           
     | 
| 
       208 
     | 
    
         
            -
             
     | 
| 
      
 207 
     | 
    
         
            +
                          content = Spatial.get_text_content(section)
         
     | 
| 
      
 208 
     | 
    
         
            +
                          if numeric_sequence? pdf, content
         
     | 
| 
      
 209 
     | 
    
         
            +
                            refs += split_by_delimiter pdf, content
         
     | 
| 
       209 
210 
     | 
    
         
             
                          elsif multi_margin? section[:lines]
         
     | 
| 
       210 
211 
     | 
    
         
             
                            refs += split_by_margin section[:lines]
         
     | 
| 
       211 
212 
     | 
    
         
             
                          elsif multi_spacing? section[:lines]
         
     | 
| 
         @@ -213,7 +214,7 @@ module PdfExtract 
     | 
|
| 
       213 
214 
     | 
    
         
             
                          end
         
     | 
| 
       214 
215 
     | 
    
         
             
                        end
         
     | 
| 
       215 
216 
     | 
    
         
             
                      end
         
     | 
| 
       216 
     | 
    
         
            -
             
     | 
| 
      
 217 
     | 
    
         
            +
             
     | 
| 
       217 
218 
     | 
    
         
             
                      # TODO Ideally we wouldn't see the ref headers here.
         
     | 
| 
       218 
219 
     | 
    
         
             
                      # Unfortunately publication details can look a lot like references.
         
     | 
| 
       219 
220 
     | 
    
         
             
                      refs.reject do |ref|
         
     | 
    
        data/lib/references/resolve.rb
    CHANGED
    
    | 
         @@ -12,12 +12,12 @@ module PdfExtract::Resolve 
     | 
|
| 
       12 
12 
     | 
    
         
             
                  resolved = {}
         
     | 
| 
       13 
13 
     | 
    
         
             
                  begin
         
     | 
| 
       14 
14 
     | 
    
         
             
                    doc = Nokogiri::HTML(open url)
         
     | 
| 
       15 
     | 
    
         
            -
             
     | 
| 
      
 15 
     | 
    
         
            +
             
     | 
| 
       16 
16 
     | 
    
         
             
                    result = doc.at_css "div.result"
         
     | 
| 
       17 
17 
     | 
    
         
             
                    unless result.nil?
         
     | 
| 
       18 
18 
     | 
    
         
             
                      score = result.at_css("span.cr_score").content.to_s
         
     | 
| 
       19 
19 
     | 
    
         
             
                      if score.to_i >= 90
         
     | 
| 
       20 
     | 
    
         
            -
                        doi = result.at_css "span.doi" 
     | 
| 
      
 20 
     | 
    
         
            +
                        doi = result.at_css "span.doi"
         
     | 
| 
       21 
21 
     | 
    
         
             
                        resolved[:doi] = doi.content.sub "http://dx.doi.org/", ""
         
     | 
| 
       22 
22 
     | 
    
         
             
                      end
         
     | 
| 
       23 
23 
     | 
    
         
             
                    end
         
     | 
| 
         @@ -25,17 +25,17 @@ module PdfExtract::Resolve 
     | 
|
| 
       25 
25 
     | 
    
         
             
                  end
         
     | 
| 
       26 
26 
     | 
    
         
             
                  resolved
         
     | 
| 
       27 
27 
     | 
    
         
             
                end
         
     | 
| 
       28 
     | 
    
         
            -
             
     | 
| 
      
 28 
     | 
    
         
            +
             
     | 
| 
       29 
29 
     | 
    
         
             
              end
         
     | 
| 
       30 
     | 
    
         
            -
             
     | 
| 
      
 30 
     | 
    
         
            +
             
     | 
| 
       31 
31 
     | 
    
         
             
              class FreeCite
         
     | 
| 
       32 
     | 
    
         
            -
             
     | 
| 
      
 32 
     | 
    
         
            +
             
     | 
| 
       33 
33 
     | 
    
         
             
                def self.find ref
         
     | 
| 
       34 
34 
     | 
    
         
             
                  Net::HTTP.start "freecite.library.brown.edu" do |http|
         
     | 
| 
       35 
35 
     | 
    
         
             
                    r = http.post "/citations/create", "citation=#{ref}",
         
     | 
| 
       36 
36 
     | 
    
         
             
                                  "Accept" => "text/xml"
         
     | 
| 
       37 
37 
     | 
    
         
             
                    doc = Nokogiri::XML r.body
         
     | 
| 
       38 
     | 
    
         
            -
             
     | 
| 
      
 38 
     | 
    
         
            +
             
     | 
| 
       39 
39 
     | 
    
         
             
                    {
         
     | 
| 
       40 
40 
     | 
    
         
             
                      :title => doc.at_xpath("//title").content,
         
     | 
| 
       41 
41 
     | 
    
         
             
                      :journal => doc.at_xpath("//journal").content,
         
     | 
| 
         @@ -44,13 +44,13 @@ module PdfExtract::Resolve 
     | 
|
| 
       44 
44 
     | 
    
         
             
                    }
         
     | 
| 
       45 
45 
     | 
    
         
             
                  end
         
     | 
| 
       46 
46 
     | 
    
         
             
                end
         
     | 
| 
       47 
     | 
    
         
            -
             
     | 
| 
      
 47 
     | 
    
         
            +
             
     | 
| 
       48 
48 
     | 
    
         
             
              end
         
     | 
| 
       49 
     | 
    
         
            -
             
     | 
| 
      
 49 
     | 
    
         
            +
             
     | 
| 
       50 
50 
     | 
    
         
             
              class SimpleTextQuery
         
     | 
| 
       51 
51 
     | 
    
         | 
| 
       52 
52 
     | 
    
         
             
                @@cookie = nil
         
     | 
| 
       53 
     | 
    
         
            -
             
     | 
| 
      
 53 
     | 
    
         
            +
             
     | 
| 
       54 
54 
     | 
    
         
             
                def self.find ref
         
     | 
| 
       55 
55 
     | 
    
         
             
                  create_session
         
     | 
| 
       56 
56 
     | 
    
         | 
| 
         @@ -68,10 +68,10 @@ module PdfExtract::Resolve 
     | 
|
| 
       68 
68 
     | 
    
         
             
                  response = Net::HTTP.start "www.crossref.org" do |http|
         
     | 
| 
       69 
69 
     | 
    
         
             
                    http.request post
         
     | 
| 
       70 
70 
     | 
    
         
             
                  end
         
     | 
| 
       71 
     | 
    
         
            -
             
     | 
| 
      
 71 
     | 
    
         
            +
             
     | 
| 
       72 
72 
     | 
    
         
             
                  doc = Nokogiri::HTML response.body
         
     | 
| 
       73 
73 
     | 
    
         
             
                  doi = doc.at_css "td.resultB > a"
         
     | 
| 
       74 
     | 
    
         
            -
             
     | 
| 
      
 74 
     | 
    
         
            +
             
     | 
| 
       75 
75 
     | 
    
         
             
                  if doi.nil?
         
     | 
| 
       76 
76 
     | 
    
         
             
                    {}
         
     | 
| 
       77 
77 
     | 
    
         
             
                  else
         
     | 
| 
         @@ -87,11 +87,11 @@ module PdfExtract::Resolve 
     | 
|
| 
       87 
87 
     | 
    
         
             
                    end
         
     | 
| 
       88 
88 
     | 
    
         
             
                  end
         
     | 
| 
       89 
89 
     | 
    
         
             
                end
         
     | 
| 
       90 
     | 
    
         
            -
             
     | 
| 
      
 90 
     | 
    
         
            +
             
     | 
| 
       91 
91 
     | 
    
         
             
              end
         
     | 
| 
       92 
     | 
    
         
            -
             
     | 
| 
      
 92 
     | 
    
         
            +
             
     | 
| 
       93 
93 
     | 
    
         
             
              @@resolvers = [Sigg]
         
     | 
| 
       94 
     | 
    
         
            -
             
     | 
| 
      
 94 
     | 
    
         
            +
             
     | 
| 
       95 
95 
     | 
    
         
             
              def self.resolvers= resolver
         
     | 
| 
       96 
96 
     | 
    
         
             
                @@resolvers = resolver
         
     | 
| 
       97 
97 
     | 
    
         
             
              end
         
     | 
| 
         @@ -109,5 +109,5 @@ module PdfExtract::Resolve 
     | 
|
| 
       109 
109 
     | 
    
         
             
                end
         
     | 
| 
       110 
110 
     | 
    
         
             
                ref
         
     | 
| 
       111 
111 
     | 
    
         
             
              end
         
     | 
| 
       112 
     | 
    
         
            -
             
     | 
| 
      
 112 
     | 
    
         
            +
             
     | 
| 
       113 
113 
     | 
    
         
             
            end
         
     | 
    
        data/lib/references/score.rb
    CHANGED
    
    
    
        data/lib/spatial.rb
    CHANGED
    
    | 
         @@ -24,13 +24,13 @@ module PdfExtract 
     | 
|
| 
       24 
24 
     | 
    
         | 
| 
       25 
25 
     | 
    
         
             
                def self.merge_lines a, b, so
         
     | 
| 
       26 
26 
     | 
    
         
             
                  so[:lines] = []
         
     | 
| 
       27 
     | 
    
         
            -
             
     | 
| 
      
 27 
     | 
    
         
            +
             
     | 
| 
       28 
28 
     | 
    
         
             
                  if a.key? :lines
         
     | 
| 
       29 
29 
     | 
    
         
             
                    so[:lines] += a[:lines]
         
     | 
| 
       30 
30 
     | 
    
         
             
                  else
         
     | 
| 
       31 
31 
     | 
    
         
             
                    so[:lines] << as_line(a)
         
     | 
| 
       32 
32 
     | 
    
         
             
                  end
         
     | 
| 
       33 
     | 
    
         
            -
             
     | 
| 
      
 33 
     | 
    
         
            +
             
     | 
| 
       34 
34 
     | 
    
         
             
                  if b.key? :lines
         
     | 
| 
       35 
35 
     | 
    
         
             
                    so[:lines] += b[:lines]
         
     | 
| 
       36 
36 
     | 
    
         
             
                  else
         
     | 
| 
         @@ -60,7 +60,7 @@ module PdfExtract 
     | 
|
| 
       60 
60 
     | 
    
         
             
                    so[:content] = (a[:content] + options[:separator] + b[:content])
         
     | 
| 
       61 
61 
     | 
    
         
             
                    so[:content] = so[:content].gsub /\s+/, " "
         
     | 
| 
       62 
62 
     | 
    
         
             
                  end
         
     | 
| 
       63 
     | 
    
         
            -
             
     | 
| 
      
 63 
     | 
    
         
            +
             
     | 
| 
       64 
64 
     | 
    
         
             
                  if get_text_content(a).length > get_text_content(b).length
         
     | 
| 
       65 
65 
     | 
    
         
             
                    so[:font] = a[:font]
         
     | 
| 
       66 
66 
     | 
    
         
             
                    so[:line_height] = a[:line_height]
         
     | 
| 
         @@ -115,12 +115,12 @@ module PdfExtract 
     | 
|
| 
       115 
115 
     | 
    
         
             
                # correct write order, specified by write_mode.
         
     | 
| 
       116 
116 
     | 
    
         
             
                def self.collapse objs, options={}
         
     | 
| 
       117 
117 
     | 
    
         
             
                  options = @@default_options.merge options
         
     | 
| 
       118 
     | 
    
         
            -
             
     | 
| 
      
 118 
     | 
    
         
            +
             
     | 
| 
       119 
119 
     | 
    
         
             
                  sorted = case write_mode
         
     | 
| 
       120 
120 
     | 
    
         
             
                           when :left_to_right
         
     | 
| 
       121 
121 
     | 
    
         
             
                             objs.sort_by { |obj| -(obj[:y].floor * 100) + (obj[:x] / 100.0) }
         
     | 
| 
       122 
122 
     | 
    
         
             
                           end
         
     | 
| 
       123 
     | 
    
         
            -
             
     | 
| 
      
 123 
     | 
    
         
            +
             
     | 
| 
       124 
124 
     | 
    
         
             
                  if sorted.count == 1
         
     | 
| 
       125 
125 
     | 
    
         
             
                    sorted.first.dup
         
     | 
| 
       126 
126 
     | 
    
         
             
                  else
         
     | 
| 
         @@ -132,18 +132,18 @@ module PdfExtract 
     | 
|
| 
       132 
132 
     | 
    
         
             
                  end
         
     | 
| 
       133 
133 
     | 
    
         
             
                end
         
     | 
| 
       134 
134 
     | 
    
         | 
| 
       135 
     | 
    
         
            -
                def self.contains? a, b
         
     | 
| 
       136 
     | 
    
         
            -
                  a_x1 = a[:x]
         
     | 
| 
       137 
     | 
    
         
            -
                  a_x2 = a[:x] + a[:width]
         
     | 
| 
       138 
     | 
    
         
            -
                  a_y1 = a[:y]
         
     | 
| 
       139 
     | 
    
         
            -
                  a_y2 = a[:y] + a[:height]
         
     | 
| 
      
 135 
     | 
    
         
            +
                def self.contains? a, b, padding=0
         
     | 
| 
      
 136 
     | 
    
         
            +
                  a_x1 = a[:x] - padding
         
     | 
| 
      
 137 
     | 
    
         
            +
                  a_x2 = a[:x] + a[:width] + (padding * 2)
         
     | 
| 
      
 138 
     | 
    
         
            +
                  a_y1 = a[:y] - padding
         
     | 
| 
      
 139 
     | 
    
         
            +
                  a_y2 = a[:y] + a[:height] + (padding * 2)
         
     | 
| 
       140 
140 
     | 
    
         | 
| 
       141 
141 
     | 
    
         
             
                  b_x1 = b[:x]
         
     | 
| 
       142 
142 
     | 
    
         
             
                  b_x2 = b[:x] + b[:width]
         
     | 
| 
       143 
143 
     | 
    
         
             
                  b_y1 = b[:y]
         
     | 
| 
       144 
144 
     | 
    
         
             
                  b_y2 = b[:y] + b[:height]
         
     | 
| 
       145 
145 
     | 
    
         | 
| 
       146 
     | 
    
         
            -
                  b_x1 >= a_x1 && b_x2 <= a_x2 && b_y1 >= a_y1 && b_y2 <= a_y2 
     | 
| 
      
 146 
     | 
    
         
            +
                  b_x1 >= a_x1 && b_x2 <= a_x2 && b_y1 >= a_y1 && b_y2 <= a_y2
         
     | 
| 
       147 
147 
     | 
    
         
             
                end
         
     | 
| 
       148 
148 
     | 
    
         | 
| 
       149 
149 
     | 
    
         
             
                def self.overlap? from, by, a, b
         
     | 
| 
         @@ -158,7 +158,7 @@ module PdfExtract 
     | 
|
| 
       158 
158 
     | 
    
         
             
                    diffs = items.map {|item| (item[f] - ideals[f][0]).abs}
         
     | 
| 
       159 
159 
     | 
    
         
             
                    diffs.map! {|d| d.nan? ? 1 : d}
         
     | 
| 
       160 
160 
     | 
    
         
             
                    max_diff = diffs.max
         
     | 
| 
       161 
     | 
    
         
            -
             
     | 
| 
      
 161 
     | 
    
         
            +
             
     | 
| 
       162 
162 
     | 
    
         
             
                    scores = diffs.map do |d|
         
     | 
| 
       163 
163 
     | 
    
         
             
                      if d == 0
         
     | 
| 
       164 
164 
     | 
    
         
             
                        ideals[f][1]
         
     | 
| 
         @@ -173,6 +173,6 @@ module PdfExtract 
     | 
|
| 
       173 
173 
     | 
    
         
             
                    end
         
     | 
| 
       174 
174 
     | 
    
         
             
                  end
         
     | 
| 
       175 
175 
     | 
    
         
             
                end
         
     | 
| 
       176 
     | 
    
         
            -
             
     | 
| 
      
 176 
     | 
    
         
            +
             
     | 
| 
       177 
177 
     | 
    
         
             
              end
         
     | 
| 
       178 
178 
     | 
    
         
             
            end
         
     |