math_metadata_lookup 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,39 @@
1
module MathMetadata

  # Generic metadata record. Values are kept in a Hash keyed by symbols and
  # can be reached three ways: entity[:key], entity.key and entity.key = value.
  #
  # NOTE: respond_to? is deliberately left untouched. method_missing answers
  # every message, so advertising that through respond_to_missing? would make
  # the object claim to implement whatever interface a duck-typing caller
  # happens to probe for.
  class Entity

    # Method names treated as dynamic writers ("title=", "year=", ...).
    # The pattern is anchored and must start with a letter or underscore:
    # an unanchored /(.*?)=/ also matches operator names such as "<=", ">="
    # or "=~" and would silently store a bogus key instead of reading.
    WRITER_RE = /\A([[:alpha:]_].*)=\z/

    # meta:: Hash of initial values. A shallow copy is stored, so later
    #        writes through this object do not touch the caller's Hash.
    #        Lookups convert the key with to_sym, so symbol keys are expected.
    def initialize( meta={} )
      @metadata = meta.dup
    end

    # Dynamic accessors: "name=" stores the first argument under :name,
    # any other unknown message is treated as a read of that key (nil when
    # the key is absent).
    def method_missing( meth, *args )
      if meth.to_s =~ WRITER_RE
        self[$1] = args.first
      else
        self[meth]
      end
    end

    # Reads the value stored under +key+ (String or Symbol).
    def [](key)
      @metadata[key.to_sym]
    end

    # Stores +value+ under +key+ (String or Symbol) and returns +value+.
    def []=(key, value)
      @metadata[key.to_sym] = value
    end

    # Renders the entity in the requested format.
    #
    # f:: :text, :html or :xml dispatch to to_text / to_html / to_xml;
    #     every other value (including the default :ruby) returns self.
    def format( f=:ruby )
      case f.to_sym
      when :text, :html, :xml
        send("to_#{f}")
      else
        self
      end
    end

  end

end
@@ -0,0 +1,85 @@
1
+ # -*-: coding: utf-8 -*-
2
+ # vi: fenc=utf-8:expandtab:ts=2:sw=2:sts=2
3
+
4
module MathMetadata

  # Main class for searching through all sites
  class Lookup
    attr_accessor :options

    # opts:: :sites can be :all or array of allowed sites ([:mrev, :zbl]);
    #        :verbose is handed to every site instance (default true).
    def initialize( opts={} )
      @options = { :sites => :all, :verbose => true }.merge(opts)
      @sites = []
    end

    # Forwards any unknown call to every selected site class.
    #
    # A fresh site instance is built per class; when the first argument is a
    # Hash its :nwords entry is passed to the site constructor. Calls without
    # arguments are allowed (previously they failed on nil[:nwords]).
    #
    # Returns a Result wrapping one Hash per site with the keys
    # :site, :name, :url and :result (whatever the site method returned).
    def method_missing(meth, *args)
      query = args.first
      nwords = query.kind_of?(Hash) ? query[:nwords] : nil

      entries = selected_sites.map do |klass|
        site = klass.new(:verbose => @options[:verbose], :nwords => nwords)

        entry = {:site => klass::ID, :name => klass::NAME, :url => klass::URL}
        entry[:result] = site.send(meth, *args)
        entry
      end

      Result.new(entries)
    end


    # try to decide what is best result for query and combine results from all sites to one article response
    #
    # args:: :title, :authors, :year describe the wanted article;
    #        :threshold (default 0.6) is the minimal similarity to keep.
    #
    # The sites are queried with a reduced form of each author name and
    # :nwords => 2, while similarity is computed against the names exactly
    # as given. The caller's Hash and its :authors array are left untouched
    # (the old in-place map! rewrote the caller's array, so the "full name"
    # comparison silently used the reduced names too).
    #
    # Returns the Result of the article query where every non-nil
    # site[:result] is cut down to the single best candidate, or to an
    # empty array when nothing reaches the threshold.
    def heuristic( args={} )
      opts = {:threshold => 0.6}.merge(args)
      threshold = opts[:threshold].to_f

      # use only authors surnames
      search = args.dup
      search[:authors] = Array(args[:authors]).map { |name| surname(name) }
      search[:nwords] = 2
      sites = article(search)

      # query article has to contain full names
      query_article = Article.new( {:title => args[:title].to_s, :authors => args[:authors], :year => args[:year]} )
      sites.each do |site|
        next if site[:result].nil?

        candidates = site[:result].to_a
        candidates.each do |candidate|
          next if candidate[:title].to_s.empty?
          candidate[:similarity] = query_article.similarity(candidate)
        end

        best = candidates.
          reject { |c| c[:similarity].to_f < threshold }.
          max_by { |c| c[:similarity].to_f }
        site[:result] = best ? [best] : []
      end

      sites
    end


    # parse reference string and execute heuristic to query for article in databases
    #
    # args:: :reference is the raw reference string; :verbose pretty-prints
    #        the parsed Reference; the remaining keys are passed on to
    #        heuristic. Returns an empty Result when the reference string
    #        yields no article (e.g. it is nil or empty).
    def reference( args={} )
      ref = Reference.new args[:reference]
      pp ref if args[:verbose]

      parsed = ref.article
      return Result.new unless parsed

      opts = {:threshold => 0.6}.merge(args)
      opts[:title] = parsed[:title]
      opts[:authors] = parsed[:authors]
      opts[:year] = parsed[:year]

      heuristic opts
    end


    private

    # Site classes allowed by options[:sites]: all of SITES for :all,
    # otherwise only those whose ID is listed.
    def selected_sites
      return SITES.dup if @options[:sites] == :all

      allowed = [@options[:sites]].flatten
      SITES.select { |klass| allowed.include?(klass::ID) }
    end

    # Reduces an author name for the site query: first the part before the
    # first comma, then, if that still holds several words, its first word.
    # Values that are not Strings are returned unchanged.
    def surname( name )
      return name unless name.kind_of?(String)

      short = name
      short = $1 if short =~ /([^,]+)/
      short = $1 if short =~ /([^ ]+) \S+/
      short
    end

  end # Lookup

end # module
@@ -0,0 +1,122 @@
1
+ # -*-: coding: utf-8 -*-
2
+ # vi: fenc=utf-8:expandtab:ts=2:sw=2:sts=2
3
+
4
module MathMetadata

  # One bibliographic reference. Built either from an Article or from a
  # free-form reference string, which is parsed into an Article by trying
  # the ARTICLE_REFERENCE_n_RE patterns in order.
  class Reference

    # 1=authors, 2=title, 3=publication, 4=year, 5=range
    ARTICLE_REFERENCE_1_RE = %r{([^:]+):\s*(.*?),\s*([^,]+),\s*\((\d{4})\)\s*,\s*([^ ]+)\s*.*?}mi
    # 1=authors, 2=title, 3=publication, 4=range, 5=publisher, 6=place, 7=year
    ARTICLE_REFERENCE_2_RE = %r{([^:]+):\s*(.*?),\s*(.*?,\s*[^,]+,\s*[^,]+,\s*[^,]+),\s*pp\.\s*([^,]+?),\s*([^,]+),\s*(.*?),\s*(\d{4})\s*.*?}mi
    # 1=authors, 2=title, 3=range, 4=publication, 5=place, 6=year
    ARTICLE_REFERENCE_3_RE = %r{([^:]+):\s*(.*?),\s*pp\.\s*([^,]+?),\s*([^,]+),\s*(.*?),\s*(\d{4})}mi
    # 1=authors, 2=title, 3=publication, 4=publisher, 5=place, 6=year
    ARTICLE_REFERENCE_4_RE = %r{([^:]+):\s*(.*?),\s*(.*?),\s*([^,]+),\s*([^,]+),\s*(\d{4})\s*.*?}mi
    # 1=authors, 2=title, 3=publisher, 4=place, 5=year
    ARTICLE_REFERENCE_5_RE = %r{([^:]+):\s*(.*?),\s*(.*?),\s*([^,]+),\s*(\d{4})\s*.*?}mi
    # 1=authors, 2=title, 3=publisher, 4=place, 5=year
    ARTICLE_REFERENCE_6_RE = %r{([^:]+):\s*(.*?),\s*([^,]+),\s*([^,]+),\s*(\d{4})\s*}mi
    # 1=authors, 2=title, 3=publication, 4=year, 5=range
    ARTICLE_REFERENCE_7_RE = %r{([^:]+):\s*(.*),\s*(.*?,\s*\d+)\s*\((\d{4})\),\s*([^ ]+)\s*.*?}mi
    # 1=authors, 2=title, 3=publication, 4=year, 5=range
    ARTICLE_REFERENCE_8_RE = %r{([^:]+):\s*(.*),\s*(.*?)\s*\((\d{4})\),\s*([^ ]+)\s*.*?}mi
    # 1=authors, 2=title, 3=publisher, 4=place
    ARTICLE_REFERENCE_9_RE = %r{([^:]+):\s*(.*?),\s*([^,]+),\s*(.*)}mi
    # 1=authors, 2=title, 3=publication
    ARTICLE_REFERENCE_10_RE = %r{([^:]+):\s*(.*?),\s*(.*?)\s*.*?}mi
    # 1=authors, 2=title, 3=place, 4=year
    ARTICLE_REFERENCE_11_RE = %r{([^:]+):\s*(.*),(.*?)\s+(\d{4})}mi

    # Article keys filled in by parse; keys a pattern does not provide
    # are set to nil.
    FIELDS = [:authors, :title, :publication, :year, :range, :id, :place, :publisher].freeze

    # Patterns in the order they are tried, each with the capture group
    # that feeds every article field. The position in this list plus one is
    # the pattern number reported by parse.
    # NOTE(review): pattern 10 appears to match every string pattern 11
    # could match, so pattern 11 looks unreachable - confirm the intended order.
    PATTERNS = [
      [ARTICLE_REFERENCE_1_RE,  {:authors => 1, :title => 2, :publication => 3, :year => 4, :range => 5}],
      [ARTICLE_REFERENCE_2_RE,  {:authors => 1, :title => 2, :publication => 3, :year => 7, :range => 4, :place => 6, :publisher => 5}],
      [ARTICLE_REFERENCE_3_RE,  {:authors => 1, :title => 2, :publication => 4, :year => 6, :range => 3, :place => 5}],
      [ARTICLE_REFERENCE_4_RE,  {:authors => 1, :title => 2, :publication => 3, :year => 6, :place => 5, :publisher => 4}],
      [ARTICLE_REFERENCE_5_RE,  {:authors => 1, :title => 2, :year => 5, :place => 4, :publisher => 3}],
      [ARTICLE_REFERENCE_6_RE,  {:authors => 1, :title => 2, :year => 5, :place => 4, :publisher => 3}],
      [ARTICLE_REFERENCE_7_RE,  {:authors => 1, :title => 2, :publication => 3, :year => 4, :range => 5}],
      [ARTICLE_REFERENCE_8_RE,  {:authors => 1, :title => 2, :publication => 3, :year => 4, :range => 5}],
      [ARTICLE_REFERENCE_9_RE,  {:authors => 1, :title => 2, :place => 4, :publisher => 3}],
      [ARTICLE_REFERENCE_10_RE, {:authors => 1, :title => 2, :publication => 3}],
      [ARTICLE_REFERENCE_11_RE, {:authors => 1, :title => 2, :year => 4, :place => 3}]
    ].freeze

    # Separators applied one after another by split_authors.
    AUTHOR_SEPARATORS = [
      /;\s*/,
      /,?\s*(?:and|und|et)\s+/,
      /(\S+,\s*[^,]+),?\s*/
    ].freeze


    attr_accessor :source, :article, :suffix, :number, :reg

    # str:: an Article (stored as is) or a reference string to parse.
    #       For nil or an empty string nothing is parsed and article,
    #       suffix and reg stay nil.
    # i::   position of this reference, stored in number.
    #
    # reg receives the number of the pattern that matched (0 when none
    # did). parse always returned that number but it used to be dropped
    # here; presumably this is what the reg accessor was meant to expose.
    def initialize( str=nil, i=1 )
      @number = i
      @source = @article = @suffix = @reg = nil

      if str.kind_of?(Article)
        @article = str
      else
        @source = str
        @article, @suffix, @reg = Reference.parse(str) unless str.to_s.empty?
      end
    end


    # Parses a reference string with the first matching pattern.
    #
    # Returns [article, suffix, rnumber]: an Article with every key of
    # FIELDS assigned (:range goes through MathMetadata.normalize_range,
    # :authors through split_authors), a suffix that is always nil here,
    # and the 1-based number of the matching pattern (0 if none matched).
    def self.parse( str )
      article = Article.new
      rnumber = 0
      suffix = nil
      found = {}

      PATTERNS.each_with_index do |(re, groups), idx|
        match = re.match(str.to_s)
        next unless match

        groups.each { |field, group| found[field] = match[group] }
        found[:range] = MathMetadata.normalize_range(found[:range]) if groups.key?(:range)
        rnumber = idx + 1
        break
      end

      FIELDS.each { |key| article[key] = found[key] }
      article.authors = Reference.split_authors article.authors

      [article, suffix, rnumber]
    end


    # Splits an author list into single names by applying each of
    # AUTHOR_SEPARATORS in turn; blank pieces are dropped.
    # nil yields an empty array.
    def self.split_authors( str )
      authors = AUTHOR_SEPARATORS.inject([str]) do |parts, re|
        parts.map { |part| part.to_s.split(re) }.flatten
      end

      authors.reject { |name| name.strip.empty? }
    end

  end

end
@@ -0,0 +1,97 @@
1
+ # -*-: coding: utf-8 -*-
2
+ # vi: fenc=utf-8:expandtab:ts=2:sw=2:sts=2
3
+
4
module MathMetadata

  # Collection of per-site answers. Every element is a Hash describing one
  # site; the renderers below read its :name, :url and :result entries,
  # where :result is a list of entities responding to to_html / to_xml /
  # to_text, or nil when the site returned nothing.
  class Result
    include Enumerable

    FORMATS = [:ruby, :yaml, :xml, :html, :text]

    # meta:: Array of site Hashes; stored by reference, not copied.
    def initialize( meta=[] )
      @metadata = meta
    end


    # Appends one site Hash. Returns the underlying Array.
    def <<(val)
      @metadata << val
    end


    # Yields every site Hash; without a block an Enumerator is returned.
    def each
      return enum_for(:each) unless block_given?

      @metadata.each do |site|
        yield site
      end
    end


    # Renders the result in format +f+.
    #
    # :ruby (the default) returns self, the same way Entity#format does;
    # it used to fail because no to_ruby method exists. Any other value is
    # dispatched to the matching to_<f> method.
    def format( f=:ruby )
      fmt = f.to_sym
      return self if fmt == :ruby

      send "to_#{fmt}"
    end


    # HTML fragment with one <div class="site"> per site. A site whose
    # :result is nil is rendered as an empty block instead of raising.
    def to_html
      result = "".dup
      @metadata.each do |site|
        result << %~
<div class="site">
    <h3>Site: #{site[:name]}</h3>~
        site[:result].to_a.each do |entity|
          result << entity.to_html
        end
        result << %~</div>~
      end
      result
    end


    # XML document with one <site> element per site inside <mml>. A site
    # whose :result is nil is rendered as an empty element instead of raising.
    def to_xml
      result = "".dup

      result << %~<?xml version="1.0" encoding="utf-8"?>
<mml>~
      @metadata.each do |site|
        result << %~
    <site name="#{site[:name]}">~
        site[:result].to_a.each do |entity|
          result << entity.to_xml
        end
        result << %~
    </site>
~
      end
      result << %~</mml>~

      result
    end


    # YAML dump of the underlying Array.
    def to_yaml
      # loaded here so the method works even when nothing else required YAML
      require 'yaml'
      @metadata.to_yaml
    end


    # The underlying Array of site Hashes (not a copy).
    def to_array
      @metadata
    end


    # Plain-text listing; sites without a :result are skipped.
    def to_text
      result = "".dup
      @metadata.each do |site|
        next unless site[:result]
        result << "Site: #{site[:name]}\n"
        result << "URL: #{site[:url]}\n"
        result << "\n"
        site[:result].each do |entity|
          result << entity.to_text
        end
        result << "\n"
      end
      result
    end
    alias :to_str :to_text

  end

end # MathMetadata
@@ -0,0 +1,221 @@
1
+ # -*-: coding: utf-8 -*-
2
+ # vi: fenc=utf-8:expandtab:ts=2:sw=2:sts=2
3
+
4
+ require 'htmlentities'
5
+ require 'open-uri'
6
+ require 'i18n'
7
+ require 'cgi'
8
+
9
+
10
module MathMetadata

  # Every Site subclass, appended by Site.inherited as the subclass is defined.
  SITES = []

  # Abstract class. Inherit in your sites definition.
  #
  # A concrete site supplies URL templates (AUTHOR_URL, ARTICLE_URL,
  # ARTICLE_ID_URL) and regular expressions named <WHAT>_RE, <WHAT>S_RE and
  # LIST_OF_<WHAT>_RE; the dynamic get_* / list_of_*? methods look those
  # constants up by name.
  class Site

    # Names answered by method_missing.
    DYNAMIC_METHOD_RE = /\A(?:list_of_.*\?|get_.*)\z/

    # Percent-encoder used for URL parts. URI.escape was removed in
    # Ruby 3.0; the RFC 2396 parser offers the same escaping and is used
    # whenever it exists, with the old module function as fallback.
    URI_ESCAPER = defined?(URI::RFC2396_Parser) ? URI::RFC2396_Parser.new : URI

    # opts:: :verbose (default true) prints every fetched URL;
    #        :nwords, when set, limits the title query to that many words.
    def initialize( opts={} )
      @options = { :verbose => true }.merge(opts)
    end

    # register new site class
    def self.inherited( site )
      SITES << site
    end


    # search for authors
    #
    # args:: :name is the author name to look up.
    # Returns an Array of Author objects built from the name forms found;
    # entries with a blank id are dropped.
    def author( args={} )
      opts = {:name => nil}.merge(args)

      found = author_name_forms(opts[:name]).map do |af|
        Author.new({:id => af[1], :preferred => af[0], :forms => af[2]})
      end

      found.reject { |entry| entry[:id].to_s.strip.empty? }
    end


    # search for articles
    #
    # args:: :id selects one article; otherwise :title, :authors and :year
    #        form the query. :references (default true) controls whether
    #        references are extracted.
    # Returns an Array of articles, or nil when the page could not be
    # fetched or no article with a title was found.
    def article( args={} )
      opts = {:id => nil, :title => "", :year => "", :authors => [], :references => true}.merge(args)

      page = fetch_article(opts)
      # this used to "return metadata", an undefined name that only became
      # nil by falling through method_missing
      return nil unless page

      articles = []
      if list_of_articles?(page)
        articles = get_article_list(page)
      else
        a = get_article(page, opts)
        articles << a unless a[:title].to_s.strip.empty?
      end

      articles.empty? ? nil : articles
    end


  protected


    # Dynamic extractors driven by the subclass' regexp constants; the page
    # text is always the first argument.
    #
    # list_of_<what>?(page)::   match result of LIST_OF_<WHAT>_RE on the page.
    # get_<what>_m(page, m, n):: for every <WHAT>S_RE match: the first m
    #                            groups (stripped) followed by an array of
    #                            the first n groups of each <WHAT>_RE match
    #                            inside group m+1. m and n default to 1.
    # get_<what>_s(page)::      first group of <WHAT>S_RE, scanned with
    #                            <WHAT>_RE; array of stripped first groups.
    # get_<what>(page, n)::     groups of <WHAT>_RE, stripped; only the
    #                            first one unless n > 1.
    #
    # Any other unknown message returns nil, as it always did.
    def method_missing(meth, *args)
      page = args.first

      case meth.to_s
      when /\Alist_of_(.*)\?\z/
        re = site_regexp("LIST_OF_#{$1.upcase}_RE")
        return page =~ re

      when /\Aget_(.*)_m\z/
        what = $1.upcase
        re = site_regexp("#{what}_RE")
        re_s = site_regexp("#{what}S_RE")
        m, n = args[1,2]
        m ||= 1
        n ||= 1
        return page.to_s.scan(re_s).map do |match|
          entry = (0...m).map { |i| match[i].to_s.strip }
          forms = []
          match[m].scan(re) { |form| n.times { |i| forms << form[i] } } if match[m]
          entry << forms
        end

      when /\Aget_(.*)_s\z/
        what = $1.upcase
        re = site_regexp("#{what}_RE")
        re_s = site_regexp("#{what}S_RE")
        found = re_s.match(page.to_s)
        entries = found ? found[1] : nil
        res = []
        entries.to_s.strip.scan(re) do |match|
          res << match[0].to_s.strip
        end
        return res

      when /\Aget_(.*)\z/
        found = site_regexp("#{$1.upcase}_RE").match(page)
        groups = found ? found.captures.map { |x| x.to_s.strip } : []
        return groups.first if args[1].to_i <= 1
        return groups
      end

      nil
    end


    # Reports the dynamic get_* / list_of_*? names to respond_to?.
    def respond_to_missing?( meth, include_private=false )
      !!(meth.to_s =~ DYNAMIC_METHOD_RE) || super
    end


    # Looks a regexp constant up on the concrete site class. const_get is
    # used instead of eval so text taken from a method name is never
    # executed as code; an unknown name raises NameError.
    def site_regexp( name )
      self.class.const_get(name)
    end


    # Percent-encodes +str+ for use inside a URL.
    def uri_escape( str )
      URI_ESCAPER.escape(str.to_s)
    end


    # search for author name forms
    def author_name_forms( name )
      page = fetch_author name
      get_author_m page, 2, 1
    end


    # Builds a Reference for every reference found on the page: HTML tags
    # are removed, runs of spaces collapsed, and references are numbered
    # from 1 in page order.
    def get_article_references( page )
      refs = get_article_reference_s page

      refs.each_with_index.map do |r, idx|
        Reference.new r.gsub(/<.*?>/,'').gsub(/ +/,' ').strip, idx + 1
      end
    end


    # MSC codes found on the page, passed through MathMetadata.normalize_mscs.
    def get_article_msc( page )
      MathMetadata.normalize_mscs(get_article_msc_s(page))
    end


    # Builds an Article from a single-article page.
    #
    # opts:: :references - extract the references as well when truthy.
    def get_article( page, opts={} )
      a = Article.new( {
        :id => get_article_id(page),
        :authors => get_article_author_s(page),
        :msc => get_article_msc(page),
        :publication => get_article_publication(page),
        :range => MathMetadata.normalize_range(get_article_range(page)),
        :year => get_article_year(page),
        :keywords => get_article_keyword_s(page),
        :issn => get_article_issn_s(page)
      } )

      a.title, a.language = get_article_title(page, 2)
      a.title = a.title.to_s.gsub(/<\/span>/,'')
      a.references = get_article_references(page) if opts[:references]

      a
    end


    # Fetches every article listed on a result page (one request per
    # ARTICLE_ENTRY_RE match, using its first group as the id). Entries
    # that cannot be fetched or have no title are skipped; a failed
    # lookup used to raise on nil.first.
    def get_article_list( page )
      articles = []
      page.scan(self.class::ARTICLE_ENTRY_RE).each do |match|
        found = article(:id => match[0])
        a = found && found.first
        next if a.nil?
        articles << a unless a[:title].to_s.strip.empty?
      end
      articles
    end


    # First options[:nwords] space-separated words of +s+.
    def nwords(s)
      s.split(" ")[0...@options[:nwords].to_i].join(" ")
    end


    # Downloads +url+ and returns the body.
    #
    # args:: :entities (default true) - decode HTML entities in the body.
    def fetch_page( url, args={} )
      opts = {:entities => true}.merge(args)

      puts "fetching #{url}" if @options[:verbose]
      page = URI.parse(url).read
      page = HTMLEntities.decode_entities(page) if page and opts[:entities]

      page
    end


    # Fetches the author search page for +name+.
    def fetch_author( name )
      nn = MathMetadata.normalize_name(name)
      url = self.class::AUTHOR_URL % uri_escape(nn)

      fetch_page(url)
    end


    # Author names normalized, joined with "; " and percent-encoded as a
    # whole, so the separator's space is encoded too and the resulting URL
    # stays parseable. nil gives an empty string.
    def join_article_authors( authors )
      names = Array(authors).map { |author| MathMetadata.normalize_name(author) }
      uri_escape names.join('; ')
    end

    # Fetches the page for one article (when :id is given) or the search
    # page for :title / :authors / :year. The whole option Hash is passed
    # on to fetch_page.
    def fetch_article( args={} )
      opts = {:id => nil, :title => "", :year => "", :authors => []}.merge(args)
      id = opts[:id].to_s.strip

      if id.empty?
        author = join_article_authors opts[:authors]
        title = opts[:title]
        title = '' if not title.kind_of?(String)
        title = MathMetadata.normalize_text(title)
        title = nwords(title) if @options[:nwords]
        url = self.class::ARTICLE_URL % [uri_escape(title), author, opts[:year].to_s]
      else
        url = self.class::ARTICLE_ID_URL % uri_escape(id)
      end

      fetch_page(url, opts)
    end

  end # Site

end # Module