math_metadata_lookup 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,67 @@
1
+ # -*- coding: utf-8 -*-
2
+ # vi: fenc=utf-8:expandtab:ts=2:sw=2:sts=2
3
+
4
module MathMetadata

  # Mathematical Reviews (MathSciNet)
  # http://www.ams.org/mr-database
  #
  # Site definition made of URL templates (filled in with String#%) and
  # screen-scraping regular expressions, plus two overrides.
  # NOTE(review): the constants are presumably consumed by the Site base
  # class, which is not part of this file -- verify against Site.
  class MR < Site
    ID = :mr
    NAME = "MathSciNet"
    URL = "http://www.ams.org/mr-database"


    # AUTHOR_URL % "Author, Name"
    AUTHOR_URL = %~http://www.ams.org/mathscinet/search/authors.html?authorName=%s&Submit=Search~

    # Captures: 1 = profile heading, 2 = numeric MR Author ID,
    # 3 = (optional) inner HTML of the "variations" list.
    AUTHORS_RE = %r{<h1 class="profileHead">(.*)<\/h1>.*?<li>\s*MR Author ID:\s*<b>\s*(\d+)\s*</b></li>(?:.*?<ul class="variations">(.*?)<\/ul>)?}mi
    # Captures the content of a single <li> item.
    AUTHOR_RE = %r{<li>(.*?)<\/li>}mi


    # ARTICLE_ID_URL % id  ("%%" is a literal percent sign for String#%)
    ARTICLE_ID_URL = "http://www.ams.org/msnmain?preferred_language=en&pg3=MR&s3=%s&l=20&reference_lists=show&simple_headlines=full&contributed_items=show&redirect=Providence%%2C+RI+USA&Submit=Start+Search&fn=130&form=basicsearch"
    # Earlier variants of ARTICLE_URL, kept for reference:
    # ARTICLE_URL = "http://www.ams.org/mathscinet/search/publications.html?pg4=TI&s4=%s&co4=AND&%s&Submit=Search&dr=all&yrop=eq&arg3=%s&dr=pubyear&yearRangeFirst=&yearRangeSecond=&pg8=ET&s8=All&review_format=html"
    #ARTICLE_URL = "http://www.ams.org/mathscinet/search/publdoc.html?co4=AND&dr=pubyear&pg4=TI&pg8=ET&r=1&review_format=html&s4=%s&%s&All&vfpref=html&yearRangeFirst=&yearRangeSecond=&yrop=eq&arg3=%s"
    # Three placeholders: title (search slot 4), the string built by
    # join_article_authors, and the value for arg3 (paired with dr=pubyear).
    ARTICLE_URL = "http://www.ams.org/mathscinet/search/publications.html?co4=AND&dr=pubyear&pg4=TI&pg8=ET&r=1&review_format=html&s4=%s&%s&All&vfpref=html&yearRangeFirst=&yearRangeSecond=&yrop=eq&arg3=%s"

    LIST_OF_ARTICLES_RE = %r{<strong>Matches:</strong>\s*\d*}mi
    ARTICLE_ENTRY_RE = %r{<div class="headlineText">\s*<a href="/mathscinet/search/publdoc.html[^"]+">\s*<strong>\s*([^< ]+)\s*</strong>\s*<strong>}mi

    ARTICLE_ID_RE = %r{<strong>(.*?)</strong>}mi
    ARTICLE_TITLE_RE = %r{<span class="title">(?:<span class="searchHighlight">)?(.*?)</span>\s*(?:<span class="sumlang">\(?(.*?)\)?</span>)?}mi
    ARTICLE_AUTHORS_RE = %r{<br />(<a href="/mathscinet/search/publications.html[^"]*">.*?</a>)<br />}mi
    ARTICLE_AUTHOR_RE = %r{<a href="/mathscinet/search/publications.html[^"]*">(.*?)</a>}mi
    ARTICLE_MSCS_RE = %r{<a href="/mathscinet/search/mscdoc.html\?code=[^"]*">(.*?)</a>}mi
    ARTICLE_MSC_RE = %r{([^, ]+)}mi
    ARTICLE_PUBLICATION_RE = %r{<a href="/mathscinet/search/journaldoc\.html\?cn=[^"]*">\s*<em>(.*?)</em>\s*</a>}mi
    # Page range written with an en dash (U+2013).
    ARTICLE_RANGE_RE = %r{(\d+–\d+)}mi
    ARTICLE_YEAR_RE = %r{<a href="/mathscinet/search/publications\.html[^"]*">\s*\(?(\d{4})\)?, </a>}mi
    ARTICLE_ISSNS_RE = %r{(ISSN.*?)<br>}mi
    ARTICLE_ISSN_RE = %r{ISSN\s*(.........)}mi
    ARTICLE_KEYWORDS_RE = %r{<p><i>Keywords:</i>\s*(.*?)\s*</p>}mi
    # One keyword per match. The "+" quantifier was missing, which made the
    # pattern capture a single character at a time; it now agrees with the
    # Zentralblatt definition of the same constant.
    ARTICLE_KEYWORD_RE = %r{([^;]+) ?}mi
    ARTICLE_REFERENCES_RE = %r{<center>\s*<strong>\s*References\s*</strong>\s*</center>\s*<ol>\s*(.*?)\s*</ol>}mi
    ARTICLE_REFERENCE_RE = %r{<li>\s*([^:]+:.*?)\s*</li>}

    # Alternative, more detailed reference pattern (disabled):
    #ARTICLE_REFERENCE_RE = %r{([^:]+):(.*?)\s*<span class="bf">\s*(.*?)\s*<\/span>\s*\((\d+)\)\s*(?:,\s*([^ ]+?)\s*<a href="[^"]+"\s*>\s*([^ ]+)\s*.*?)?}mi

    # Builds the author part of the ARTICLE_URL query string.
    #
    # Each author occupies one numbered search slot ("pgN=AUCN&sN=<name>&coN=AND").
    # Numbering starts at 5 because slot 4 is used for the title in ARTICLE_URL.
    #
    # authors:: enumerable of author names
    # Returns the slots joined with "&" (an empty string for no authors).
    def join_article_authors( authors )
      authors.each_with_index.map { |author, idx|
        slot = idx + 5
        # URI.escape is gone since Ruby 3.0 and never escaped "&", "=" or "+"
        # inside a value; form-encode the name as a query component instead.
        name = URI.encode_www_form_component(MathMetadata.normalize_name(author))
        "pg#{slot}=AUCN&s#{slot}=#{name}&co#{slot}=AND"
      }.join("&")
    end

    # Extends the inherited reference extraction: for every reference the
    # first "MR..." token found in its source text becomes the id of the
    # referenced article (nil when the source text contains no such token).
    #
    # page:: passed unchanged to the superclass implementation
    # Returns the collection produced by the superclass.
    def get_article_references( page )
      refs = super page
      refs.each do |ref|
        ref.source =~ /(MR[^\s]+)/
        ref.article[:id] = $1
      end
      refs
    end

  end # MR

end
@@ -0,0 +1,97 @@
1
+ # -*- coding: utf-8 -*-
2
+ # vi: fenc=utf-8:expandtab:ts=2:sw=2:sts=2
3
+
4
+ require 'rexml/document'
5
+
6
module MathMetadata

  # Zentralblatt
  # http://www.zentralblatt-math.org/zmath/
  #
  # Article lookups request the XML output of the site (type=xml in the URL
  # templates) and are parsed with REXML in #article.
  # NOTE(review): the *_RE constants are presumably consumed by the Site base
  # class, which is not part of this file -- verify against Site.
  class ZBL < Site
    ID = :zbl
    NAME = "Zentralblatt"
    URL = "http://www.zentralblatt-math.org/zmath/"


    # AUTHOR_URL % "author query"
    AUTHOR_URL ="http://www.zentralblatt-math.org/zbmath/authors/?q=%s"

    # Captures: 1 = name, 2 = author id, 3 = text of the "Spellings:" block.
    AUTHORS_RE = %r{<div class="name">\s*<strong>(.*?)</strong>.*?Author-Id:\s*</div>\s*([^ <]+)\s*</div>.*?<div class="table">\s*<div class="title">Spellings:</div>\s*(.*?)\s*</div>}mi
    # One spelling, i.e. the text in front of a "[number]" marker.
    AUTHOR_RE = %r{(.*?)\s*\[\d+\](?:;\s*)?}


    # ARTICLE_ID_URL % id
    ARTICLE_ID_URL = "http://www.zentralblatt-math.org/zmath/en/search?q=an:%s&type=xml&format=complete"
    # Placeholders: title, the string built by join_article_authors, year.
    # "%%26" is a literal "%26" (an encoded "&") after String#% formatting.
    ARTICLE_URL = "http://www.zentralblatt-math.org/zmath/en/search?q=ti:%s%%26%s%%26py:%s&type=xml&format=complete"

    LIST_OF_ARTICLES_RE = %r{<strong class="middle">Result:</strong>}mi
    ARTICLE_ENTRY_RE = %r{<span[^>]*?>\s*<a href="\?q=an:([^\&]+)\&format=complete">[^<]+</a>\s*<b>}mi

    ARTICLE_ID_RE = %r{<a href="\?q=an:.*?complete">(.*?)</a>}mi
    ARTICLE_TITLE_RE = %r{</a><br>(.*?)\.</b>\s*\((.*?)\)<br>}mi
    ARTICLE_AUTHORS_RE = %r{<br><b>(<a href="\?q=[^"]*">.*?</a>)<br>}mi
    ARTICLE_AUTHOR_RE = %r{<a href="\?q=[^"]*">(.*?)</a>}mi
    ARTICLE_MSCS_RE = %r{<dd>(.*?)</dd>}mi
    ARTICLE_MSC_RE = %r{<a href=".*?">(.*?)</a>}mi
    ARTICLE_PUBLICATION_RE = %r{<a href="[^"j]*?journals[^"]*">(.*?)</a>}mi
    ARTICLE_RANGE_RE = %r{</a> \d+(?:-\d+)?,\s*(\d+-\d+).*?ISSN}
    ARTICLE_YEAR_RE = %r{</a>\s*\d+-\d+, \d+-\d+ \((\d+)\)\.}mi
    ARTICLE_ISSNS_RE = %r{(ISSN.*?)<br>}mi
    ARTICLE_ISSN_RE = %r{ISSN\s*(.........)}mi
    ARTICLE_KEYWORDS_RE = %r{<p><i>Keywords:</i>\s*(.*?)\s*</p>}mi
    ARTICLE_KEYWORD_RE = %r{([^;]+) ?}mi
    ARTICLE_REFERENCES_RE = %r{<p><i>Citations:</i>\s*(.*?)\s*</p>}
    # 1=authors, 2=journal, 3=volume/issue, 4=year, 5=range, 6=ref
    # Placeholder pattern; this class builds references in its own
    # get_article_references below.
    ARTICLE_REFERENCE_RE = %r{xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx}
    #<p><i>Citations:</i> <a href="?an=0962.76001">Zbl 0962.76001</a>; <a href="?an=0784.46029">Zbl 0784.46029</a>; <a href="?an=0974.46040">Zbl 0974.46040</a></p>

    # Fetches articles and builds one Article per <rec> element of the XML
    # answer.
    #
    # args:: options hash; recognised keys and their defaults are
    #        :id => nil, :title => "", :year => "", :authors => [],
    #        :references => true, :entities => false.
    #        With :references => true every cited article is fetched as well
    #        (one additional lookup per citation, see get_article_references).
    # Returns an Array of Article objects (empty when nothing matched).
    def article( args={} )
      opts = {:id => nil, :title => "", :year => "", :authors => [], :references => true, :entities => false}.merge(args)
      page = fetch_article(opts)
      xml = ::REXML::Document.new page

      articles = []
      xml.elements.each("//zbml/answers/rec") do |rec|
        # NOTE(review): normalize_mscs is resolved on self here; presumably
        # it is provided by Site -- verify.
        entry = Article.new({
          :id => get_element(rec, "an"),
          :title => get_element(rec, "ti"),
          :authors => get_element(rec, "au").split(/;\s*/),
          :language => get_element(rec, "la"),
          :msc => normalize_mscs(get_element(rec, "cc").split(" ")),
          :year => get_element(rec, "py"),
          :keywords => get_element(rec, "ut").split(/;\s*/),
          :issn => get_element(rec, "is").split(/;\s*/),
          :publication => get_element(rec, "so")
        })
        entry.references = get_article_references(get_element(rec, "ci")) if opts[:references]
        # The page range is taken from the publication string, from the
        # "<from>-<to> (<year>)" part; it stays nil when that part is absent.
        entry.publication =~ /,\s*(\d+-\d+)\s*\(\d{4}\)/
        entry.range = $1
        articles << entry
      end

      articles
    end

    protected

    # Text of the first node matching +path+ below +xml+, stripped and with
    # HTML entities decoded. Returns "" when nothing matches.
    def get_element(xml, path)
      element = xml.elements[path]
      return "" unless element
      CGI::unescapeHTML(element.children.first.to_s.strip)
    end

    # Builds the author part of the ARTICLE_URL query: one "au:<name>" term
    # per author, joined with "%26" (an encoded "&").
    def join_article_authors( authors )
      # URI.escape is gone since Ruby 3.0 and never escaped "&", "=" or "+";
      # form-encode each name as a query component instead.
      authors.map { |author| "au:#{URI.encode_www_form_component(author)}" }.join("%26")
    end

    # Turns a ";"-separated list of article ids into Reference objects.
    #
    # Every id is looked up with #article (with :references => false, so the
    # lookup does not recurse further); references are numbered from 1.
    # NOTE(review): when a lookup returns no article the Reference is created
    # with nil -- confirm Reference handles that.
    def get_article_references( str )
      str.to_s.split(/;\s*/).each_with_index.map do |id, idx|
        Reference.new(article(:id => id, :references => false).first, idx+1)
      end
    end


  end # ZBL

end
@@ -0,0 +1,110 @@
1
+ # -*- coding: utf-8 -*-
2
+ # vi: fenc=utf-8:expandtab:ts=2:sw=2:sts=2
3
+
4
+ require 'unicode'
5
+
6
module MathMetadata

  # Text helpers callable as MathMetadata.<name>.
  class << self

    # Similarity of two strings based on their Levenshtein edit distance.
    #
    # Despite the name the result is not the raw distance but
    #   1 - distance / (length of the longer string),
    # so 1.0 means "equal" and 0.0 means "every character differs".
    # Strings are compared character by character (not byte by byte).
    #
    # s1, s2:: strings to compare
    # Returns a Float between 0.0 and 1.0.
    def levenshtein_distance( s1, s2 )
      # Also covers two empty strings, so the division below is never by zero.
      return 1.0 if s1 == s2

      s1u, s2u = s1.split(//u), s2.split(//u)
      rows, cols = s1u.size, s2u.size

      # tab[i][j] = edit distance between the first i chars of s1 and the
      # first j chars of s2; row 0 and column 0 hold the trivial cases.
      tab = Array.new(rows + 1) { Array.new(cols + 1, 0) }
      (0..rows).each { |i| tab[i][0] = i }
      (0..cols).each { |j| tab[0][j] = j }

      (1..cols).each do |j|
        (1..rows).each do |i|
          tab[i][j] =
            if s1u[i-1] == s2u[j-1]
              tab[i-1][j-1]
            else
              # cheapest of deletion, insertion and substitution
              [tab[i-1][j], tab[i][j-1], tab[i-1][j-1]].min + 1
            end
        end
      end

      1 - (tab[rows][cols].to_f / [rows, cols].max)
    end # levenshtein_distance


    # Replaces an en dash or a double hyphen in a page range with a single
    # hyphen. nil is treated as an empty string.
    def normalize_range( range )
      range.to_s.gsub(/–|--/,'-')
    end


    # Splits every entry on "," or ";" and keeps the first token of each
    # piece without surrounding whitespace or parentheses.
    # A piece without any token (e.g. an empty string) yields nil.
    #
    # mscs:: array of strings
    # Returns a flat array.
    def normalize_mscs( mscs )
      mscs.map { |entry| entry.split(/,|;/) }.flatten.map do |piece|
        piece[/\s*\(?([^\s\)\(]+)\)?\s*/, 1]
      end
    end


    # Normalizes an author name for use in a search query.
    #
    # name:: the name; nil is treated as an empty string
    # Returns the normalized String.
    def normalize_name( name )
      # only latin chars
      trans = latex_to_utf8(name.to_s)
      trans = I18n.transliterate(trans)

      # remove Jr.
      trans.sub! %r{\bjr\.(\b|$)}i, ' '

      # remove abbr.: Rakosnik, J. => Rakosnik,
      trans.sub! %r{(\W|^)\w\.}i, ' '

      # transform: Surname, N.M. => Surname, N. M.
      # (in place: the non-bang gsub used before threw its result away,
      # so this step never had any effect)
      trans.gsub!( /([^\s,])?\.([^\s,])/, '\1. \2' )

      #MathMetadata.remove_punctuation(trans)
      trans
    end


    # Drops runs of "." and "," that directly follow a word character or a
    # whitespace character and are themselves followed by a space or the end
    # of a line, then strips the result.
    def remove_punctuation( s )
      str = s.gsub %r{(\w)[.,]+( |$)}i, '\1 '
      str.gsub! %r{(\s)[.,]+( |$)}i, '\1 '
      str.strip
    end


    # Reduces a text (e.g. a title) to a canonical form for comparison:
    # LaTeX accents resolved, transliterated, lower case, punctuation
    # replaced by spaces, the words "the", "a" and "of" removed, words
    # separated by exactly one space.
    #
    # s:: the text; nil is treated as an empty string
    # Returns the normalized String.
    def normalize_text( s )
      str = latex_to_utf8(s.to_s)
      str = I18n.transliterate(str).downcase
      str = remove_punctuation(str)
      str = str.gsub(%r{\W+}, ' ')
      # Whole words only. The former pattern (?:the|a|of|)\s+ had no word
      # boundaries, so it also cut the last letter off words such as "data".
      str = str.gsub(%r{\b(?:the|a|of)\b}i, ' ')
      str.squeeze(' ').strip
    end

    # LaTeX accent command character => Unicode combining character.
    ACCENT_REPL = {
      "`" => "\u0300", # grave accent
      "'" => "\u0301", # acute accent
      "^" => "\u0302", # circumflex
      '"' => "\u0308", # umlaut or dieresis
      "~" => "\u0303", # tilde
      "H" => "\u030b", # long Hungarian umlaut (double acute)
      "c" => "\u0327", # cedilla
      "=" => "\u0304", # macron accent
      "." => "\u0307", # dot over the letter
      "r" => "\u030a", # ring over the letter
      "u" => "\u0306", # breve over the letter
      "v" => "\u030c"  # caron/hacek ("v") over the letter
    }

    # Replaces LaTeX accent sequences with the accented character.
    #
    # Recognised forms are \Xc, \X{c} and \X{\c}, where X is a key of
    # ACCENT_REPL and c is an ASCII letter. Sequences whose command character
    # is not in ACCENT_REPL are left untouched.
    #
    # s:: String
    # Returns a new String.
    def latex_to_utf8( s )
      s.gsub( /\\(.)(?:([a-zA-Z])|\{([a-zA-Z])\}|\{\\([a-zA-Z])\})/ ) do |match|
        accent = ACCENT_REPL[$1]
        char = $2 || $3 || $4
        # letter + combining accent, passed through Unicode.normalize_KC
        accent ? Unicode.normalize_KC( char + accent ) : match
      end
    end

  end # <<self

end # module
@@ -0,0 +1,29 @@
1
+ # -*- coding: utf-8 -*-
2
+ # vi: fenc=utf-8:expandtab:ts=2:sw=2:sts=2
3
+ #
4
+ # @author: Petr Kovar <pejuko@gmail.com>
5
+
6
+ require 'rubygems'
7
+ require 'find'
8
+
9
# Gem specification of math_metadata_lookup. The assignment is the last
# statement of the file, so evaluating the file yields the specification.
spec = Gem::Specification.new do |gem|
  # identity
  gem.name = 'math_metadata_lookup'
  gem.version = '0.1'
  gem.date = Time.now.strftime("%Y-%m-%d") # date of the build
  gem.platform = Gem::Platform::RUBY

  # people and places
  gem.authors = ["Petr Kovar"]
  gem.email = "pejuko@gmail.com"
  gem.homepage = "http://github.com/pejuko/math_metadata_lookup"

  # texts
  gem.summary = "Search mathematical reviews sites and fetches metadata about articles."
  gem.description = <<EOF
This utility/library search mathematical reviews sites and fetches metadata about articles.
It can return results as one of text, xml, html, yaml or ruby formats.
EOF

  # runtime dependencies
  gem.add_dependency('i18n', '>= 0.5.0')
  gem.add_dependency('unicode')

  # contents: fixed top-level files plus every Ruby file below lib/
  gem.require_path = 'lib'
  gem.executables = ["math_metadata_lookup"]
  gem.files = ["bin/math_metadata_lookup", "README.md", "math_metadata_lookup.gemspec", "TODO", "Rakefile"] + Dir["lib/**/*.rb"]
end
29
+
metadata ADDED
@@ -0,0 +1,108 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: math_metadata_lookup
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ version: "0.1"
9
+ platform: ruby
10
+ authors:
11
+ - Petr Kovar
12
+ autorequire:
13
+ bindir: bin
14
+ cert_chain: []
15
+
16
+ date: 2011-01-25 00:00:00 +01:00
17
+ default_executable:
18
+ dependencies:
19
+ - !ruby/object:Gem::Dependency
20
+ name: i18n
21
+ prerelease: false
22
+ requirement: &id001 !ruby/object:Gem::Requirement
23
+ none: false
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
29
+ - 5
30
+ - 0
31
+ version: 0.5.0
32
+ type: :runtime
33
+ version_requirements: *id001
34
+ - !ruby/object:Gem::Dependency
35
+ name: unicode
36
+ prerelease: false
37
+ requirement: &id002 !ruby/object:Gem::Requirement
38
+ none: false
39
+ requirements:
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ segments:
43
+ - 0
44
+ version: "0"
45
+ type: :runtime
46
+ version_requirements: *id002
47
+ description: |
48
+ This utility/library search mathematical reviews sites and fetches metadata about articles.
49
+ It can return results as one of text, xml, html, yaml or ruby formats.
50
+
51
+ email: pejuko@gmail.com
52
+ executables:
53
+ - math_metadata_lookup
54
+ extensions: []
55
+
56
+ extra_rdoc_files: []
57
+
58
+ files:
59
+ - bin/math_metadata_lookup
60
+ - README.md
61
+ - math_metadata_lookup.gemspec
62
+ - TODO
63
+ - Rakefile
64
+ - lib/math_metadata_lookup.rb
65
+ - lib/math_metadata_lookup/site.rb
66
+ - lib/math_metadata_lookup/article.rb
67
+ - lib/math_metadata_lookup/tools.rb
68
+ - lib/math_metadata_lookup/lookup.rb
69
+ - lib/math_metadata_lookup/result.rb
70
+ - lib/math_metadata_lookup/sites/mr.rb
71
+ - lib/math_metadata_lookup/sites/zbl.rb
72
+ - lib/math_metadata_lookup/reference.rb
73
+ - lib/math_metadata_lookup/entity.rb
74
+ - lib/math_metadata_lookup/author.rb
75
+ has_rdoc: true
76
+ homepage: http://github.com/pejuko/math_metadata_lookup
77
+ licenses: []
78
+
79
+ post_install_message:
80
+ rdoc_options: []
81
+
82
+ require_paths:
83
+ - lib
84
+ required_ruby_version: !ruby/object:Gem::Requirement
85
+ none: false
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ segments:
90
+ - 0
91
+ version: "0"
92
+ required_rubygems_version: !ruby/object:Gem::Requirement
93
+ none: false
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ segments:
98
+ - 0
99
+ version: "0"
100
+ requirements: []
101
+
102
+ rubyforge_project:
103
+ rubygems_version: 1.3.7
104
+ signing_key:
105
+ specification_version: 3
106
+ summary: Search mathematical reviews sites and fetches metadata about articles.
107
+ test_files: []
108
+