math_metadata_lookup 0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,67 @@
1
+ # -*- coding: utf-8 -*-
2
+ # vi: fenc=utf-8:expandtab:ts=2:sw=2:sts=2
3
+
4
+ module MathMetadata
5
+
6
# Mathematical Reviews (MathSciNet) site definition.
# http://www.ams.org/mr-database
#
# Holds the URL templates and the regular expressions applied to MathSciNet
# HTML pages, plus two MR-specific overrides.  The constants are consumed by
# the Site superclass, which is defined outside this file.
class MR < Site
  ID = :mr
  NAME = "MathSciNet"
  URL = "http://www.ams.org/mr-database"


  # AUTHOR_URL % "Author, Name"
  AUTHOR_URL = "http://www.ams.org/mathscinet/search/authors.html?authorName=%s&Submit=Search"

  # 1=profile heading, 2=MR author id, 3=optional list of name variations
  AUTHORS_RE = %r{<h1 class="profileHead">(.*)<\/h1>.*?<li>\s*MR Author ID:\s*<b>\s*(\d+)\s*</b></li>(?:.*?<ul class="variations">(.*?)<\/ul>)?}mi
  AUTHOR_RE = %r{<li>(.*?)<\/li>}mi


  # ARTICLE_ID_URL % "MR id"; '%%' is a literal percent sign for the format call
  ARTICLE_ID_URL = "http://www.ams.org/msnmain?preferred_language=en&pg3=MR&s3=%s&l=20&reference_lists=show&simple_headlines=full&contributed_items=show&redirect=Providence%%2C+RI+USA&Submit=Start+Search&fn=130&form=basicsearch"
  # ARTICLE_URL = "http://www.ams.org/mathscinet/search/publications.html?pg4=TI&s4=%s&co4=AND&%s&Submit=Search&dr=all&yrop=eq&arg3=%s&dr=pubyear&yearRangeFirst=&yearRangeSecond=&pg8=ET&s8=All&review_format=html"
  #ARTICLE_URL = "http://www.ams.org/mathscinet/search/publdoc.html?co4=AND&dr=pubyear&pg4=TI&pg8=ET&r=1&review_format=html&s4=%s&%s&All&vfpref=html&yearRangeFirst=&yearRangeSecond=&yrop=eq&arg3=%s"
  # Three placeholders; the middle one takes the string built by
  # join_article_authors (presumably title first and year last -- confirm in Site).
  ARTICLE_URL = "http://www.ams.org/mathscinet/search/publications.html?co4=AND&dr=pubyear&pg4=TI&pg8=ET&r=1&review_format=html&s4=%s&%s&All&vfpref=html&yearRangeFirst=&yearRangeSecond=&yrop=eq&arg3=%s"

  LIST_OF_ARTICLES_RE = %r{<strong>Matches:</strong>\s*\d*}mi
  ARTICLE_ENTRY_RE = %r{<div class="headlineText">\s*<a href="/mathscinet/search/publdoc.html[^"]+">\s*<strong>\s*([^< ]+)\s*</strong>\s*<strong>}mi

  ARTICLE_ID_RE = %r{<strong>(.*?)</strong>}mi
  ARTICLE_TITLE_RE = %r{<span class="title">(?:<span class="searchHighlight">)?(.*?)</span>\s*(?:<span class="sumlang">\(?(.*?)\)?</span>)?}mi
  ARTICLE_AUTHORS_RE = %r{<br />(<a href="/mathscinet/search/publications.html[^"]*">.*?</a>)<br />}mi
  ARTICLE_AUTHOR_RE = %r{<a href="/mathscinet/search/publications.html[^"]*">(.*?)</a>}mi
  ARTICLE_MSCS_RE = %r{<a href="/mathscinet/search/mscdoc.html\?code=[^"]*">(.*?)</a>}mi
  ARTICLE_MSC_RE = %r{([^, ]+)}mi
  ARTICLE_PUBLICATION_RE = %r{<a href="/mathscinet/search/journaldoc\.html\?cn=[^"]*">\s*<em>(.*?)</em>\s*</a>}mi
  ARTICLE_RANGE_RE = %r{(\d+–\d+)}mi
  ARTICLE_YEAR_RE = %r{<a href="/mathscinet/search/publications\.html[^"]*">\s*\(?(\d{4})\)?, </a>}mi
  ARTICLE_ISSNS_RE = %r{(ISSN.*?)<br>}mi
  ARTICLE_ISSN_RE = %r{ISSN\s*(.........)}mi
  ARTICLE_KEYWORDS_RE = %r{<p><i>Keywords:</i>\s*(.*?)\s*</p>}mi
  # One keyword per match.  The '+' is required: without it the group
  # captures a single character instead of a whole ';'-separated keyword.
  ARTICLE_KEYWORD_RE = %r{([^;]+) ?}mi
  ARTICLE_REFERENCES_RE = %r{<center>\s*<strong>\s*References\s*</strong>\s*</center>\s*<ol>\s*(.*?)\s*</ol>}mi
  ARTICLE_REFERENCE_RE = %r{<li>\s*([^:]+:.*?)\s*</li>}

  #ARTICLE_REFERENCE_RE = %r{([^:]+):(.*?)\s*<span class="bf">\s*(.*?)\s*<\/span>\s*\((\d+)\)\s*(?:,\s*([^ ]+?)\s*<a href="[^"]+"\s*>\s*([^ ]+)\s*.*?)?}mi

  # Builds the author part of the article search query string.
  #
  # Each author becomes one "pgN=AUCN&sN=<name>&coN=AND" group.  Numbering
  # starts at 5 (ARTICLE_URL already uses pg4/s4/co4 for the title).
  #
  # authors:: enumerable of author name strings
  # Returns the groups joined with "&" ("" for an empty list).
  #
  # NOTE(review): URI.escape is not available on Ruby 3.0 and later; kept
  # because its exact escaping rules shape the request URL.
  def join_article_authors(authors)
    authors.each_with_index.map do |author, idx|
      n = idx + 5
      "pg#{n}=AUCN&s#{n}=#{URI.escape MathMetadata.normalize_name(author)}&co#{n}=AND"
    end.join("&")
  end

  # Extends the inherited reference extraction by filling in the MR id.
  #
  # For every reference returned by the superclass, the first "MR..." token
  # found in the reference source text is stored as article[:id]
  # (nil when the source contains no such token).
  #
  # page:: passed unchanged to the superclass implementation
  # Returns the list of references produced by the superclass.
  def get_article_references(page)
    refs = super(page)
    refs.each do |ref|
      found = /(MR[^\s]+)/.match(ref.source.to_s)
      ref.article[:id] = found && found[1]
    end
    refs
  end

end # MR
66
+
67
+ end
@@ -0,0 +1,97 @@
1
+ # -*- coding: utf-8 -*-
2
+ # vi: fenc=utf-8:expandtab:ts=2:sw=2:sts=2
3
+
4
+ require 'rexml/document'
5
+
6
+ module MathMetadata
7
+
8
# Zentralblatt MATH site definition.
# http://www.zentralblatt-math.org/zmath/
#
# Article lookups are answered from the XML output of the search interface
# (see #article); the *_RE constants describe the HTML pages and are used by
# the Site superclass, which is defined outside this file.
class ZBL < Site
  ID = :zbl
  NAME = "Zentralblatt"
  URL = "http://www.zentralblatt-math.org/zmath/"


  # AUTHOR_URL % "query"
  AUTHOR_URL = "http://www.zentralblatt-math.org/zbmath/authors/?q=%s"

  # 1=name, 2=author id, 3=list of spellings
  AUTHORS_RE = %r{<div class="name">\s*<strong>(.*?)</strong>.*?Author-Id:\s*</div>\s*([^ <]+)\s*</div>.*?<div class="table">\s*<div class="title">Spellings:</div>\s*(.*?)\s*</div>}mi
  AUTHOR_RE = %r{(.*?)\s*\[\d+\](?:;\s*)?}


  # '%%26' yields a literal "%26" (an URL-encoded '&') after formatting
  ARTICLE_ID_URL = "http://www.zentralblatt-math.org/zmath/en/search?q=an:%s&type=xml&format=complete"
  ARTICLE_URL = "http://www.zentralblatt-math.org/zmath/en/search?q=ti:%s%%26%s%%26py:%s&type=xml&format=complete"

  LIST_OF_ARTICLES_RE = %r{<strong class="middle">Result:</strong>}mi
  ARTICLE_ENTRY_RE = %r{<span[^>]*?>\s*<a href="\?q=an:([^\&]+)\&format=complete">[^<]+</a>\s*<b>}mi

  ARTICLE_ID_RE = %r{<a href="\?q=an:.*?complete">(.*?)</a>}mi
  ARTICLE_TITLE_RE = %r{</a><br>(.*?)\.</b>\s*\((.*?)\)<br>}mi
  ARTICLE_AUTHORS_RE = %r{<br><b>(<a href="\?q=[^"]*">.*?</a>)<br>}mi
  ARTICLE_AUTHOR_RE = %r{<a href="\?q=[^"]*">(.*?)</a>}mi
  ARTICLE_MSCS_RE = %r{<dd>(.*?)</dd>}mi
  ARTICLE_MSC_RE = %r{<a href=".*?">(.*?)</a>}mi
  ARTICLE_PUBLICATION_RE = %r{<a href="[^"j]*?journals[^"]*">(.*?)</a>}mi
  ARTICLE_RANGE_RE = %r{</a> \d+(?:-\d+)?,\s*(\d+-\d+).*?ISSN}
  ARTICLE_YEAR_RE = %r{</a>\s*\d+-\d+, \d+-\d+ \((\d+)\)\.}mi
  ARTICLE_ISSNS_RE = %r{(ISSN.*?)<br>}mi
  ARTICLE_ISSN_RE = %r{ISSN\s*(.........)}mi
  ARTICLE_KEYWORDS_RE = %r{<p><i>Keywords:</i>\s*(.*?)\s*</p>}mi
  ARTICLE_KEYWORD_RE = %r{([^;]+) ?}mi
  ARTICLE_REFERENCES_RE = %r{<p><i>Citations:</i>\s*(.*?)\s*</p>}
  # 1=authors, 2=journal, 3=volume/issue, 4=year, 5=range, 6=ref
  # NOTE(review): the pattern below looks like a placeholder; references are
  # resolved through get_article_references in this class instead.
  ARTICLE_REFERENCE_RE = %r{xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx}
  #<p><i>Citations:</i> <a href="?an=0962.76001">Zbl 0962.76001</a>; <a href="?an=0784.46029">Zbl 0784.46029</a>; <a href="?an=0974.46040">Zbl 0974.46040</a></p>

  # Looks up articles and returns them as an array of Article objects.
  #
  # args:: option hash merged over these defaults:
  #        :id => nil, :title => "", :year => "", :authors => [],
  #        :references => true, :entities => false
  #
  # The page obtained from fetch_article is parsed as XML and one Article is
  # built per "//zbml/answers/rec" element.  When :references is true, each
  # cited id is resolved through get_article_references.  The page range is
  # taken from the publication string (", 12-34 (2001)" style), or nil when
  # that string has no such part.
  def article( args={} )
    defaults = {:id => nil, :title => "", :year => "", :authors => [], :references => true, :entities => false}
    opts = defaults.merge(args)
    doc = ::REXML::Document.new(fetch_article(opts))

    found = []
    doc.elements.each("//zbml/answers/rec") do |rec|
      entry = Article.new({
        :id => get_element(rec, "an"),
        :title => get_element(rec, "ti"),
        :authors => get_element(rec, "au").split(/;\s*/),
        :language => get_element(rec, "la"),
        :msc => normalize_mscs(get_element(rec, "cc").split(" ")),
        :year => get_element(rec, "py"),
        :keywords => get_element(rec, "ut").split(/;\s*/),
        :issn => get_element(rec, "is").split(/;\s*/),
        :publication => get_element(rec, "so")
      })
      if opts[:references]
        entry.references = get_article_references(get_element(rec, "ci"))
      end
      entry.publication =~ /,\s*(\d+-\d+)\s*\(\d{4}\)/
      entry.range = Regexp.last_match(1)
      found << entry
    end

    found
  end

  protected

  # Returns the stripped, HTML-unescaped first child of the first node
  # matching +path+ below +xml+, or "" when nothing matches.
  def get_element(xml, path)
    node = xml.elements.each(path) { |_match| }.first
    node ? CGI::unescapeHTML(node.children.first.to_s.strip) : ""
  end

  # Builds the author part of the search query: one "au:<escaped name>" term
  # per author, joined with "%26".
  def join_article_authors( authors )
    authors.map { |author| "au:#{URI.escape author}" }.join("%26")
  end

  # Turns a ';'-separated list of ids into Reference objects.
  #
  # Every id is looked up with #article (with :references => false, so cited
  # articles do not pull in their own citations); the first hit is wrapped in
  # a Reference together with its 1-based position in the list.
  def get_article_references( str )
    str.to_s.split(/;\s*/).each_with_index.map do |id, idx|
      Reference.new(article(:id => id, :references => false).first, idx + 1)
    end
  end


end # ZBL
96
+
97
+ end
@@ -0,0 +1,110 @@
1
+ # -*- coding: utf-8 -*-
2
+ # vi: fenc=utf-8:expandtab:ts=2:sw=2:sts=2
3
+
4
+ require 'unicode'
5
+
6
module MathMetadata

  # Text helpers available as MathMetadata.<name>.
  class << self

    # Similarity of two strings based on the Levenshtein edit distance.
    #
    # Despite the name, the result is not the raw distance but
    #   1 - distance / length_of_the_longer_string
    # so 1.0 means "identical" and 0.0 means "nothing in common".
    # Strings are compared character by character; nil is treated as "".
    def levenshtein_distance( s1, s2 )
      return 1.0 if s1 == s2

      a = s1.to_s.each_char.to_a
      b = s2.to_s.each_char.to_a
      longest = [a.size, b.size].max
      # nil vs. "" and similar: both sides are empty, avoid dividing by zero
      return 1.0 if longest.zero?

      # Only the previous table row is needed to compute the next one.
      prev = (0..b.size).to_a
      a.each_with_index do |ca, i|
        curr = [i + 1]
        b.each_with_index do |cb, j|
          curr << if ca == cb
            prev[j]
          else
            [prev[j + 1], curr[j], prev[j]].min + 1
          end
        end
        prev = curr
      end

      1 - (prev.last.to_f / longest)
    end # levenshtein_distance


    # Replaces en dashes and double hyphens in a page range by a single
    # hyphen.  nil yields "".
    def normalize_range( range )
      range.to_s.gsub(/–|--/,'-')
    end


    # Flattens a list of MSC strings into single codes.
    #
    # Each entry is split on ',' and ';'; surrounding whitespace and
    # parentheses are dropped.  Blank tokens (e.g. from "a,,b") are skipped
    # instead of being returned as nil.
    def normalize_mscs( mscs )
      mscs.map { |m| m.split(/,|;/) }.flatten.map { |m|
        m[/\s*\(?([^\s\)\(]+)\)?\s*/, 1]
      }.compact
    end


    # Normalizes an author name for use in a search query.
    #
    # LaTeX accents are resolved and the text is transliterated, then the
    # first "Jr." and the first single-letter abbreviation are each replaced
    # by a space, and a space is inserted after every dot that is directly
    # followed by another character ("B.C." => "B. C.").
    def normalize_name( name )
      # only latin chars
      trans = latex_to_utf8(name.to_s)
      trans = I18n.transliterate(trans)

      # remove Jr.
      trans.sub! %r{\bjr\.(\b|$)}i, ' '

      # remove abbr.: Rakosnik, J. => Rakosnik,
      trans.sub! %r{(\W|^)\w\.}i, ' '

      # transform: Surname, N.M. => Surname, N. M.
      # (must be the in-place variant, a plain gsub result would be discarded)
      trans.gsub!( /([^\s,])?\.([^\s,])/, '\1. \2' )

      #MathMetadata.remove_punctuation(trans)
      trans
    end


    # Drops runs of '.' and ',' that end a word or stand alone before a
    # space or the end of a line, then strips the result.
    def remove_punctuation( s )
      s.gsub(%r{(\w)[.,]+( |$)}i, '\1 ').gsub(%r{(\s)[.,]+( |$)}i, '\1 ').strip
    end


    # Normalizes free text (e.g. a title) for fuzzy comparison.
    #
    # Resolves LaTeX accents, transliterates, downcases, removes punctuation,
    # collapses every run of non-word characters into one space and removes
    # the stop words "the", "a" and "of" when followed by whitespace.  nil is
    # treated as "".
    def normalize_text( s )
      str = latex_to_utf8(s.to_s)
      str = I18n.transliterate(str).downcase
      str = remove_punctuation(str)
      str.gsub!(%r{\W+}, ' ')
      # \b keeps the pattern from eating the tail of words like "data"
      str.gsub!(%r{\b(?:the|a|of)\s+}i, '')
      str.strip
    end

    # LaTeX accent command character => Unicode combining mark
    ACCENT_REPL = {
      "`" => "\u0300", # grave accent
      "'" => "\u0301", # acute accent
      "^" => "\u0302", # circumflex
      '"' => "\u0308", # umlaut or dieresis
      "~" => "\u0303", # tilde
      "H" => "\u030b", # long Hungarian umlaut (double acute)
      "c" => "\u0327", # cedilla
      "=" => "\u0304", # macron accent
      "." => "\u0307", # dot over the letter
      "r" => "\u030a", # ring over the letter
      "u" => "\u0306", # breve over the letter
      "v" => "\u030c"  # caron/hacek ("v") over the letter
    }.freeze

    # Replaces LaTeX accent sequences by the accented character.
    #
    # Recognized forms are \Xa, \X{a} and \X{\a}, where X is a key of
    # ACCENT_REPL and a is an ASCII letter.  The letter plus the combining
    # mark is composed with Unicode.normalize_KC.  Sequences whose command
    # character is not in ACCENT_REPL are left untouched.
    def latex_to_utf8( s )
      s.gsub( /\\(.)(?:([a-zA-Z])|\{([a-zA-Z])\}|\{\\([a-zA-Z])\})/ ) do |match|
        accent = ACCENT_REPL[$1]
        char = $2 || $3 || $4
        accent ? Unicode.normalize_KC( char + accent ) : match
      end
    end

  end # <<self

end # module
@@ -0,0 +1,29 @@
1
+ # -*- coding: utf-8 -*-
2
+ # vi: fenc=utf-8:expandtab:ts=2:sw=2:sts=2
3
+ #
4
+ # @author: Petr Kovar <pejuko@gmail.com>
5
+
6
+ require 'rubygems'
7
+ require 'find'
8
+
9
# Gem specification for math_metadata_lookup.
# The packaged file list is the fixed set of top-level files plus every
# Ruby source found under lib/ at evaluation time.
spec = Gem::Specification.new do |gem|
  # identity
  gem.name = 'math_metadata_lookup'
  gem.version = '0.1'
  gem.date = Time.now.strftime("%Y-%m-%d")
  gem.platform = Gem::Platform::RUBY

  # people and links
  gem.authors = ["Petr Kovar"]
  gem.email = "pejuko@gmail.com"
  gem.homepage = "http://github.com/pejuko/math_metadata_lookup"

  # texts
  gem.summary = "Search mathematical reviews sites and fetches metadata about articles."
  gem.description = <<EOF
This utility/library search mathematical reviews sites and fetches metadata about articles.
It can return results as one of text, xml, html, yaml or ruby formats.
EOF

  # runtime dependencies
  gem.add_dependency('i18n', '>= 0.5.0')
  gem.add_dependency('unicode')

  # contents
  gem.require_path = 'lib'
  gem.files = ["bin/math_metadata_lookup", "README.md", "math_metadata_lookup.gemspec", "TODO", "Rakefile"] + Dir["lib/**/*.rb"]
  gem.executables = ["math_metadata_lookup"]
end
29
+
metadata ADDED
@@ -0,0 +1,108 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: math_metadata_lookup
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ version: "0.1"
9
+ platform: ruby
10
+ authors:
11
+ - Petr Kovar
12
+ autorequire:
13
+ bindir: bin
14
+ cert_chain: []
15
+
16
+ date: 2011-01-25 00:00:00 +01:00
17
+ default_executable:
18
+ dependencies:
19
+ - !ruby/object:Gem::Dependency
20
+ name: i18n
21
+ prerelease: false
22
+ requirement: &id001 !ruby/object:Gem::Requirement
23
+ none: false
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
29
+ - 5
30
+ - 0
31
+ version: 0.5.0
32
+ type: :runtime
33
+ version_requirements: *id001
34
+ - !ruby/object:Gem::Dependency
35
+ name: unicode
36
+ prerelease: false
37
+ requirement: &id002 !ruby/object:Gem::Requirement
38
+ none: false
39
+ requirements:
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ segments:
43
+ - 0
44
+ version: "0"
45
+ type: :runtime
46
+ version_requirements: *id002
47
+ description: |
48
+ This utility/library search mathematical reviews sites and fetches metadata about articles.
49
+ It can return results as one of text, xml, html, yaml or ruby formats.
50
+
51
+ email: pejuko@gmail.com
52
+ executables:
53
+ - math_metadata_lookup
54
+ extensions: []
55
+
56
+ extra_rdoc_files: []
57
+
58
+ files:
59
+ - bin/math_metadata_lookup
60
+ - README.md
61
+ - math_metadata_lookup.gemspec
62
+ - TODO
63
+ - Rakefile
64
+ - lib/math_metadata_lookup.rb
65
+ - lib/math_metadata_lookup/site.rb
66
+ - lib/math_metadata_lookup/article.rb
67
+ - lib/math_metadata_lookup/tools.rb
68
+ - lib/math_metadata_lookup/lookup.rb
69
+ - lib/math_metadata_lookup/result.rb
70
+ - lib/math_metadata_lookup/sites/mr.rb
71
+ - lib/math_metadata_lookup/sites/zbl.rb
72
+ - lib/math_metadata_lookup/reference.rb
73
+ - lib/math_metadata_lookup/entity.rb
74
+ - lib/math_metadata_lookup/author.rb
75
+ has_rdoc: true
76
+ homepage: http://github.com/pejuko/math_metadata_lookup
77
+ licenses: []
78
+
79
+ post_install_message:
80
+ rdoc_options: []
81
+
82
+ require_paths:
83
+ - lib
84
+ required_ruby_version: !ruby/object:Gem::Requirement
85
+ none: false
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ segments:
90
+ - 0
91
+ version: "0"
92
+ required_rubygems_version: !ruby/object:Gem::Requirement
93
+ none: false
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ segments:
98
+ - 0
99
+ version: "0"
100
+ requirements: []
101
+
102
+ rubyforge_project:
103
+ rubygems_version: 1.3.7
104
+ signing_key:
105
+ specification_version: 3
106
+ summary: Search mathematical reviews sites and fetches metadata about articles.
107
+ test_files: []
108
+