math_metadata_lookup 0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,39 @@
1
module MathMetadata

  # Generic metadata container.
  #
  # Values are stored in a Hash keyed by symbols and can be reached either
  # through +[]+ / +[]=+ or through dynamic accessors handled by
  # +method_missing+ (<tt>entity.title</tt>, <tt>entity.title = "..."</tt>).
  class Entity

    # Name of a dynamic attribute writer such as +title=+
    # (group 1 is the attribute name).
    SETTER_RE = /\A(\w+)=\z/

    # meta:: Hash with the initial metadata. It is shallow-copied, so adding
    #        or replacing keys on the entity does not touch the caller's Hash.
    def initialize( meta={} )
      @metadata = meta.dup
    end

    # Dynamic accessors: <tt>name=</tt> stores the first argument under
    # +:name+, any other unknown message reads the key of the same name
    # (nil when the key is absent).
    #
    # The writer pattern is anchored on purpose: an unanchored match would
    # treat operator messages containing "=" (+<=+, +=~+, ...) as writers and
    # store junk keys in the metadata.
    def method_missing( meth, *args )
      setter = SETTER_RE.match(meth.to_s)
      return self[meth] unless setter

      self[setter[1]] = args.first
    end

    # Advertise only writers and keys that are actually present, so implicit
    # conversion probes (+to_ary+, +to_str+, ...) are not claimed.
    def respond_to_missing?( meth, include_private=false )
      return true if SETTER_RE.match(meth.to_s)
      return true if @metadata.respond_to?(:key?) && @metadata.key?(meth.to_sym)

      super
    end

    # Reads the value stored under +key+ (String or Symbol).
    def [](key)
      @metadata[key.to_sym]
    end

    # Stores +value+ under +key+ (String or Symbol).
    def []=(key, value)
      @metadata[key.to_sym] = value
    end

    # Returns the entity rendered in format +f+.
    #
    # +:text+, +:html+ and +:xml+ are delegated to +to_text+ / +to_html+ /
    # +to_xml+; every other format (including the default +:ruby+) returns
    # the entity itself.
    def format( f=:ruby )
      case f.to_sym
      when :text, :html, :xml
        send("to_#{f}")
      else
        self
      end
    end

  end

end
@@ -0,0 +1,85 @@
1
+ # -*-: coding: utf-8 -*-
2
+ # vi: fenc=utf-8:expandtab:ts=2:sw=2:sts=2
3
+
4
module MathMetadata

  # Main class for searching through all sites
  class Lookup
    attr_accessor :options

    # opts[:sites]::   :all or an array of allowed site IDs ([:mrev, :zbl])
    # opts[:verbose]:: passed on to every site instance
    def initialize( opts={} )
      @options = { :sites => :all, :verbose => true }.merge(opts)
      @sites = []
    end

    # Calls +meth+ on a fresh instance of every selected site class and
    # collects the answers in a Result. Each entry is a Hash with the keys
    # :site, :name, :url and :result.
    #
    # When the first argument is a Hash its :nwords value is handed to the
    # site constructor; calls without arguments are allowed.
    def method_missing(meth, *args)
      query = args.first
      nwords = query.is_a?(Hash) ? query[:nwords] : nil

      entries = selected_sites.map do |klass|
        site = klass.new(:verbose => @options[:verbose], :nwords => nwords)

        entry = {:site => klass::ID, :name => klass::NAME, :url => klass::URL}
        entry[:result] = site.send(meth, *args)
        entry
      end

      Result.new(entries)
    end

    # Only messages that some registered site class defines are advertised,
    # so implicit conversion probes (+to_ary+, +to_str+, ...) do not end up
    # being dispatched to the sites.
    def respond_to_missing?(meth, include_private=false)
      SITES.any? { |klass| klass.method_defined?(meth) } || super
    end


    # try to decide what is best result for query and combine results from all sites to one article response
    #
    # args[:title], args[:authors], args[:year]:: the article being looked for
    # args[:threshold]:: minimal similarity an article must reach (default 0.6)
    #
    # Returns the Result of the +article+ query where every site keeps at most
    # its single most similar article. The caller's +args+ (including the
    # :authors array) are left untouched.
    def heuristic( args={} )
      opts = {:threshold => 0.6}.merge(args)

      # use only authors surnames; build a new array so that the full names in
      # args[:authors] stay available for the similarity comparison below
      surnames = Array(args[:authors]).map do |a|
        a = a.to_s[/([^,]+)/, 1] || a
        a.to_s[/([^ ]+) \S+/, 1] || a
      end
      sites = article(args.merge(:authors => surnames, :nwords => 2))

      # query article has to contain full names
      query_article = Article.new( {:title => args[:title].to_s, :authors => args[:authors], :year => args[:year]} )
      threshold = opts[:threshold].to_f

      sites.each do |site|
        candidates = site[:result].to_a
        candidates.each do |found|
          next if found[:title].to_s.empty?
          found[:similarity] = query_article.similarity(found)
        end
        candidates.delete_if { |found| found[:similarity].to_f < threshold }
        next if candidates.empty?

        site[:result] = [candidates.max_by { |found| found[:similarity].to_f }]
      end

      sites
    end


    # parse reference string and execute heuristic to query for article in databases
    #
    # args[:reference]:: the reference string
    # args[:verbose]::   pretty-prints the parsed reference when true
    #
    # Returns an empty Result when the reference yields no article
    # (for instance when the reference string is empty).
    def reference( args={} )
      ref = Reference.new args[:reference]
      pp ref if args[:verbose]

      parsed = ref.article
      return Result.new unless parsed

      opts = {:threshold => 0.6}.merge(args)
      opts[:title] = parsed[:title]
      opts[:authors] = parsed[:authors]
      opts[:year] = parsed[:year]

      heuristic opts
    end


    private

    # Site classes allowed by @options[:sites] (:all, a single ID or an array of IDs).
    def selected_sites
      wanted = @options[:sites]
      return SITES.dup if wanted == :all

      allowed = [wanted].flatten
      SITES.select { |klass| allowed.include?(klass::ID) }
    end

  end # Lookup

end # module
@@ -0,0 +1,122 @@
1
+ # -*-: coding: utf-8 -*-
2
+ # vi: fenc=utf-8:expandtab:ts=2:sw=2:sts=2
3
+
4
module MathMetadata

  # One bibliographic reference: the raw source string plus the Article
  # parsed out of it by trying the ARTICLE_REFERENCE_n_RE patterns in turn.
  class Reference

    # 1=authors, 2=title, 3=publication, 4=year, 5=range
    ARTICLE_REFERENCE_1_RE = %r{([^:]+):\s*(.*?),\s*([^,]+),\s*\((\d{4})\)\s*,\s*([^ ]+)\s*.*?}mi
    # 1=authors, 2=title, 3=publication, 4=range, 5=publisher, 6=place, 7=year
    ARTICLE_REFERENCE_2_RE = %r{([^:]+):\s*(.*?),\s*(.*?,\s*[^,]+,\s*[^,]+,\s*[^,]+),\s*pp\.\s*([^,]+?),\s*([^,]+),\s*(.*?),\s*(\d{4})\s*.*?}mi
    # 1=authors, 2=title, 3=range, 4=publication, 5=place, 6=year
    ARTICLE_REFERENCE_3_RE = %r{([^:]+):\s*(.*?),\s*pp\.\s*([^,]+?),\s*([^,]+),\s*(.*?),\s*(\d{4})}mi
    # 1=authors, 2=title, 3=publication, 4=publisher, 5=place, 6=year
    ARTICLE_REFERENCE_4_RE = %r{([^:]+):\s*(.*?),\s*(.*?),\s*([^,]+),\s*([^,]+),\s*(\d{4})\s*.*?}mi
    # 1=authors, 2=title, 3=publisher, 4=place, 5=year
    ARTICLE_REFERENCE_5_RE = %r{([^:]+):\s*(.*?),\s*(.*?),\s*([^,]+),\s*(\d{4})\s*.*?}mi
    # 1=authors, 2=title, 3=publisher, 4=place, 5=year
    ARTICLE_REFERENCE_6_RE = %r{([^:]+):\s*(.*?),\s*([^,]+),\s*([^,]+),\s*(\d{4})\s*}mi
    # 1=authors, 2=title, 3=publication, 4=year, 5=range
    ARTICLE_REFERENCE_7_RE = %r{([^:]+):\s*(.*),\s*(.*?,\s*\d+)\s*\((\d{4})\),\s*([^ ]+)\s*.*?}mi
    # 1=authors, 2=title, 3=publication, 4=year, 5=range
    ARTICLE_REFERENCE_8_RE = %r{([^:]+):\s*(.*),\s*(.*?)\s*\((\d{4})\),\s*([^ ]+)\s*.*?}mi
    # 1=authors, 2=title, 3=publisher, 4=place
    ARTICLE_REFERENCE_9_RE = %r{([^:]+):\s*(.*?),\s*([^,]+),\s*(.*)}mi
    # 1=authors, 2=title, 3=publication
    ARTICLE_REFERENCE_10_RE = %r{([^:]+):\s*(.*?),\s*(.*?)\s*.*?}mi
    # 1=authors, 2=title, 3=place, 4=year
    ARTICLE_REFERENCE_11_RE = %r{([^:]+):\s*(.*),(.*?)\s+(\d{4})}mi

    # Order in which the patterns are tried; the first match wins.
    # Pattern 10 ("authors: title, ...") matches every string that pattern 11
    # matches, so the more specific pattern 11 has to be tried first or it
    # could never be selected.
    PATTERN_ORDER = [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 10]

    # Article keys filled in by parse (keys a pattern does not capture are set to nil).
    FIELDS = [:authors, :title, :publication, :year, :range, :id, :place, :publisher]

    # pattern number => { article key => capture group number }
    CAPTURE_GROUPS = {
      1  => {:authors => 1, :title => 2, :publication => 3, :year => 4, :range => 5},
      2  => {:authors => 1, :title => 2, :publication => 3, :year => 7, :range => 4, :place => 6, :publisher => 5},
      3  => {:authors => 1, :title => 2, :publication => 4, :year => 6, :range => 3, :place => 5},
      4  => {:authors => 1, :title => 2, :publication => 3, :year => 6, :place => 5, :publisher => 4},
      5  => {:authors => 1, :title => 2, :year => 5, :place => 4, :publisher => 3},
      6  => {:authors => 1, :title => 2, :year => 5, :place => 4, :publisher => 3},
      7  => {:authors => 1, :title => 2, :publication => 3, :year => 4, :range => 5},
      8  => {:authors => 1, :title => 2, :publication => 3, :year => 4, :range => 5},
      9  => {:authors => 1, :title => 2, :place => 4, :publisher => 3},
      10 => {:authors => 1, :title => 2, :publication => 3},
      11 => {:authors => 1, :title => 2, :year => 4, :place => 3}
    }


    attr_accessor :source, :article, :suffix, :number, :reg

    # str:: either an Article (stored as is) or a reference string to parse;
    #       nil or an empty string leaves +article+ unset
    # i::   stored in +number+
    def initialize( str=nil, i=1 )
      @number = i
      if str.kind_of?(Article)
        @source = @suffix = nil
        @article = str
      else
        @source = str
        @article, @suffix = Reference.parse(str) unless str.to_s.empty?
      end
    end


    # Parses a reference string.
    #
    # Returns [article, suffix, pattern_number]: +article+ is a new Article
    # with every key of FIELDS set (nil when not captured) and :authors split
    # into an array, +suffix+ is always nil, +pattern_number+ is the number of
    # the ARTICLE_REFERENCE_n_RE that matched or 0 when none did.
    def self.parse( str )
      article = Article.new
      rnumber = 0
      suffix = nil
      found = {}
      text = str.to_s

      PATTERN_ORDER.each do |j|
        match = const_get("ARTICLE_REFERENCE_#{j}_RE").match(text)
        next unless match

        CAPTURE_GROUPS[j].each { |key, group| found[key] = match[group] }
        found[:range] = MathMetadata.normalize_range(found[:range]) if found.key?(:range)
        rnumber = j
        break
      end

      FIELDS.each { |key| article[key] = found[key] }
      article.authors = Reference.split_authors article.authors

      [article, suffix, rnumber]
    end


    # Splits an authors string into single names: first on ";", then on the
    # words "and" / "und" / "et", finally into "Surname, Given" pairs.
    # Blank pieces are dropped; nil gives an empty array.
    def self.split_authors( str )
      res = [
        /;\s*/,
        # \b keeps names such as "Roland" or "Janet" from being cut at the
        # "and" / "et" they contain
        /,?\s*\b(?:and|und|et)\s+/,
        /(\S+,\s*[^,]+),?\s*/
      ]

      authors = [str]
      res.each do |re|
        authors = authors.map { |a| a.to_s.split(re) }.flatten
      end

      authors.reject { |a| a.strip.empty? }
    end

  end

end
@@ -0,0 +1,97 @@
1
+ # -*-: coding: utf-8 -*-
2
+ # vi: fenc=utf-8:expandtab:ts=2:sw=2:sts=2
3
+
4
module MathMetadata

  # Collection of per-site answers. Every element is expected to be a Hash
  # with at least the keys :name, :url and :result, where :result is nil or
  # a list of entities responding to +to_html+ / +to_xml+ / +to_text+.
  class Result
    include Enumerable

    FORMATS = [:ruby, :yaml, :xml, :html, :text]

    # meta:: array of site entries; it is stored as is (not copied)
    def initialize( meta=[] )
      @metadata = meta
    end


    # Appends one site entry.
    def <<(val)
      @metadata << val
    end


    # Yields every site entry; returns an Enumerator when no block is given.
    def each
      return enum_for(:each) unless block_given?

      @metadata.each do |site|
        yield site
      end
    end


    # Renders the result in format +f+ by calling <tt>to_#{f}</tt>.
    #
    # +:ruby+ (the default) returns the Result itself unless a +to_ruby+
    # method is available, mirroring what Entity#format does for +:ruby+.
    def format( f=:ruby )
      renderer = "to_#{f}"
      return self if f.to_sym == :ruby && !respond_to?(renderer)

      self.send renderer
    end


    # HTML fragment with one <div class="site"> per site.
    # Sites without a result are skipped, as in to_text.
    def to_html
      @metadata.map do |site|
        next "" unless site[:result]

        body = site[:result].map { |entity| entity.to_html }.join
        "\n<div class=\"site\">\n<h3>Site: #{site[:name]}</h3>#{body}</div>"
      end.join
    end


    # XML document with one <site> element per site.
    # Sites without a result are skipped, as in to_text.
    def to_xml
      sites = @metadata.map do |site|
        next "" unless site[:result]

        body = site[:result].map { |entity| entity.to_xml }.join
        "\n<site name=\"#{site[:name]}\">#{body}\n</site>\n"
      end.join

      "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<mml>#{sites}</mml>"
    end


    # YAML dump of the underlying array.
    def to_yaml
      # loaded lazily: this is the only place that needs the YAML library
      require 'yaml'
      @metadata.to_yaml
    end


    # The underlying array of site entries (not a copy).
    def to_array
      @metadata
    end


    # Plain-text listing; sites without a result are skipped.
    def to_text
      @metadata.map do |site|
        next "" unless site[:result]

        body = site[:result].map { |entity| entity.to_text }.join
        "Site: #{site[:name]}\nURL: #{site[:url]}\n\n#{body}\n"
      end.join
    end
    alias :to_str :to_text

  end

end # MathMetadata
@@ -0,0 +1,221 @@
1
+ # -*-: coding: utf-8 -*-
2
+ # vi: fenc=utf-8:expandtab:ts=2:sw=2:sts=2
3
+
4
+ require 'htmlentities'
5
+ require 'open-uri'
6
+ require 'i18n'
7
+ require 'cgi'
8
+
9
+
10
module MathMetadata

  # Every class inheriting from Site registers itself here.
  SITES = []

  # Abstract class. Inherit in your sites definition.
  #
  # A concrete site supplies URL templates (ARTICLE_URL, ARTICLE_ID_URL,
  # AUTHOR_URL) and extraction patterns named <WHAT>_RE / <WHAT>S_RE /
  # LIST_OF_<WHAT>_RE; the dynamic +get_*+ and +list_of_*?+ helpers look
  # these constants up on the concrete class.
  class Site

    # opts[:verbose]:: print every fetched URL (default true)
    # opts[:nwords]::  when set, search titles are cut to that many words
    def initialize( opts={} )
      @options = { :verbose => true }.merge(opts)
    end

    # register new site class
    def self.inherited( site )
      super
      SITES << site
    end


    # search for authors
    #
    # args[:name]:: name to look for
    #
    # Returns an array of Author objects (:id, :preferred, :forms);
    # entries without an id are dropped.
    def author( args={} )
      opts = {:name => nil}.merge(args)

      found = author_name_forms(opts[:name]).map do |af|
        Author.new({:id => af[1], :preferred => af[0], :forms => af[2]})
      end

      found.reject { |entry| entry[:id].to_s.strip.empty? }
    end


    # search for articles
    #
    # args:: :id to fetch one article, otherwise :title, :authors and :year
    #        are used for a search; :references => false skips the parsing
    #        of the reference list
    #
    # Returns an array of articles, or nil when the page could not be
    # fetched or nothing with a title was found.
    def article( args={} )
      opts = {:id => nil, :title => "", :year => "", :authors => [], :references => true}.merge(args)

      page = fetch_article(opts)
      return nil unless page

      articles = []
      if list_of_articles?(page)
        articles = get_article_list(page)
      else
        a = get_article(page, opts)
        articles << a unless a[:title].to_s.strip.empty?
      end

      articles.empty? ? nil : articles
    end


    protected


    # Dynamic extraction helpers; +page+ is always the first argument.
    #
    # list_of_<what>?(page)::  position of LIST_OF_<WHAT>_RE in page, or nil
    # get_<what>_m(page, m=1, n=1)::
    #   for every <WHAT>S_RE match an array of its first m groups (stripped)
    #   followed by an array with the first n groups of every <WHAT>_RE match
    #   found inside group m+1
    # get_<what>_s(page)::
    #   stripped first group of every <WHAT>_RE match inside the first group
    #   of <WHAT>S_RE
    # get_<what>(page, count=1)::
    #   stripped groups of <WHAT>_RE; only the first one unless count > 1
    #
    # Any other unknown message raises NoMethodError as usual.
    def method_missing(meth, *args)
      page = args.first

      case meth.to_s
      when /^list_of_(.*)\?$/
        return page =~ site_regexp("LIST_OF_#{$1.upcase}_RE")

      when /^get_(.*)_m$/
        what = $1
        re = site_regexp("#{what.upcase}_RE")
        re_s = site_regexp("#{what.upcase}S_RE")
        m, n = args[1,2]
        m ||= 1
        n ||= 1
        # a page that could not be fetched yields no entries
        return page.to_s.scan(re_s).map { |match|
          entry = Array.new(m) { |i| match[i].to_s.strip }
          forms = []
          match[m].scan(re) { |form| n.times { |i| forms << form[i] } } if match[m]
          entry << forms
        }

      when /^get_(.*)_s$/
        what = $1
        re = site_regexp("#{what.upcase}_RE")
        re_s = site_regexp("#{what.upcase}S_RE")
        block = re_s.match(page.to_s)
        entries = block && block[1]
        return entries.to_s.strip.scan(re).map { |match| match[0].to_s.strip }

      when /^get_(.*)$/
        groups = site_regexp("#{$1.upcase}_RE").match(page).to_a.map { |x| x.to_s.strip }
        groups.shift
        return groups.first if args[1].to_i <= 1
        return groups

      else
        super
      end
    end


    # Matches exactly the names method_missing handles.
    def respond_to_missing?(meth, include_private=false)
      return true if meth.to_s =~ /^(?:list_of_.*\?|get_.*)$/

      super
    end


    # Looks up an extraction pattern constant on the concrete site class
    # (NameError when the site does not define it).
    def site_regexp( const_name )
      self.class.const_get(const_name)
    end


    # search for author name forms
    def author_name_forms( name )
      page = fetch_author name

      get_author_m page, 2, 1
    end


    # Reference objects built from the page's reference entries, numbered
    # from 1; markup is removed and runs of spaces are squeezed first.
    def get_article_references( page )
      refs = get_article_reference_s page

      refs.each_with_index.map do |r, idx|
        Reference.new r.gsub(/<.*?>/,'').gsub(/ +/,' ').strip, idx + 1
      end
    end


    # MSC codes of the article, passed through MathMetadata.normalize_mscs.
    def get_article_msc( page )
      MathMetadata.normalize_mscs(get_article_msc_s(page))
    end


    # Builds an Article from a single-article page.
    # References are parsed only when opts[:references] is set.
    def get_article( page, opts={} )
      a = Article.new( {
        :id => get_article_id(page),
        :authors => get_article_author_s(page),
        :msc => get_article_msc(page),
        :publication => get_article_publication(page),
        :range => MathMetadata.normalize_range(get_article_range(page)),
        :year => get_article_year(page),
        :keywords => get_article_keyword_s(page),
        :issn => get_article_issn_s(page)
      } )

      a.title, a.language = get_article_title(page, 2)
      a.title = a.title.to_s.gsub(/<\/span>/,'')
      a.references = get_article_references(page) if opts[:references]

      a
    end


    # Fetches every article listed on a result-list page by its id.
    # Entries whose lookup returns nothing are skipped.
    def get_article_list( page )
      articles = []
      page.scan(self.class::ARTICLE_ENTRY_RE).each do |match|
        found = article(:id => match[0])
        next unless found

        a = found.first
        articles << a unless a[:title].to_s.strip.empty?
      end
      articles
    end


    # First @options[:nwords] space-separated words of +s+.
    def nwords(s)
      s.split(" ")[0...@options[:nwords].to_i].join(" ")
    end


    # Downloads +url+; HTML entities are decoded unless
    # args[:entities] is false.
    def fetch_page( url, args={} )
      opts = {:entities => true}.merge(args)

      puts "fetching #{url}" if @options[:verbose]
      page = URI.parse(url).read
      page = decode_html_entities(page) if page and opts[:entities]

      page
    end


    # Works with both htmlentities APIs: the module-level
    # HTMLEntities.decode_entities and the instance-based
    # HTMLEntities.new.decode.
    def decode_html_entities( text )
      if HTMLEntities.respond_to?(:decode_entities)
        HTMLEntities.decode_entities(text)
      else
        HTMLEntities.new.decode(text)
      end
    end


    # Percent-encodes +text+ for use in a URL. URI.escape delegated to this
    # parser method and no longer exists since Ruby 3.0.
    def escape_url( text )
      URI::DEFAULT_PARSER.escape(text)
    end


    # Downloads the author search page for +name+.
    def fetch_author( name )
      nn = MathMetadata.normalize_name(name)
      url = self.class::AUTHOR_URL % escape_url(nn)

      fetch_page(url)
    end


    # Normalized, URL-escaped author names joined by "; " (nil gives "").
    def join_article_authors( authors )
      Array(authors).map { |author| escape_url MathMetadata.normalize_name(author) }.join('; ')
    end

    # Downloads the page of the article with args[:id] or, when no id is
    # given, the search page for args[:title] / args[:authors] / args[:year].
    def fetch_article( args={} )
      opts = {:id => nil, :title => "", :year => "", :authors => []}.merge(args)
      id = opts[:id].to_s.strip

      if id.empty?
        author = join_article_authors opts[:authors]
        title = opts[:title]
        title = '' if not title.kind_of?(String)
        title = MathMetadata.normalize_text(title)
        title = nwords(title) if @options[:nwords]
        url = self.class::ARTICLE_URL % [escape_url(title), author, opts[:year].to_s]
      else
        url = self.class::ARTICLE_ID_URL % escape_url(id)
      end

      fetch_page(url, opts)
    end

  end # Site

end # Module