scrapes 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +22 -0
- data/README +123 -0
- data/demo/demo.rb +33 -0
- data/demo/pages/about.rb +32 -0
- data/demo/pages/main.rb +32 -0
- data/lib/scrapes.rb +41 -0
- data/lib/scrapes/cache.rb +110 -0
- data/lib/scrapes/cookbook.rb +53 -0
- data/lib/scrapes/cookies.rb +45 -0
- data/lib/scrapes/crawler.rb +97 -0
- data/lib/scrapes/hpricot.rb +110 -0
- data/lib/scrapes/initializer.rb +86 -0
- data/lib/scrapes/page.rb +319 -0
- data/lib/scrapes/rule_parser.rb +327 -0
- data/lib/scrapes/session.rb +155 -0
- data/lib/scrapes/to_proxy.rb +50 -0
- data/test/cache.rb +75 -0
- data/test/cookies.rb +34 -0
- data/test/crawler.rb +69 -0
- data/test/hpricot.rb +55 -0
- data/test/initializer.rb +54 -0
- data/test/lib/server.rb +63 -0
- data/test/page.rb +77 -0
- data/test/pages/foils.rb +61 -0
- data/test/pages/foils2.rb +38 -0
- data/test/pages/redhanded_entries.rb +36 -0
- data/test/pages/redhanded_main.rb +58 -0
- data/test/pages/rule_parser.rb +81 -0
- data/test/pages/simple.rb +21 -0
- data/test/public/foil72.html +10 -0
- data/test/public/foil73.html +9 -0
- data/test/public/foil74.html +11 -0
- data/test/public/foo.txt +1 -0
- data/test/public/index.html +20 -0
- data/test/public/redhanded.html +1208 -0
- data/test/public/rule_parser.html +21 -0
- data/test/public/simple.html +8 -0
- data/test/rule_parser.rb +151 -0
- data/test/session.rb +45 -0
- data/test/textcontent.rb +71 -0
- metadata +123 -0
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
== License
|
2
|
+
|
3
|
+
Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
= Scrapes
|
2
|
+
|
3
|
+
Scrapes is a framework for crawling and scraping multi-page web sites.
|
4
|
+
|
5
|
+
Unlike other scraping frameworks, Scrapes is designed to work with "dirty"
|
6
|
+
web sites. That is, web sites that were not designed to have their data
|
7
|
+
extracted programmatically.
|
8
|
+
|
9
|
+
It includes features for both the initial development of a scraper, and the
|
10
|
+
continued maintenance of that scraper. These features include:
|
11
|
+
|
12
|
+
* Rule based selection and extraction of data that can use CSS selectors or
|
13
|
+
pseudo XPath expressions
|
14
|
+
* Caching system so that during development you don't have to continuously
|
15
|
+
download pages from a web server while you experiment with your selectors and
|
16
|
+
extractors
|
17
|
+
* Validation system that helps detect web site changes that would
|
18
|
+
otherwise invalidate your extraction rules
|
19
|
+
* Support for initiating a session with the web server, and passing session
|
20
|
+
cookies back to the web server
|
21
|
+
* When all else fails, you can run a web page through the xsltproc XSLT
|
22
|
+
processor to generate an XML document that can then be run through your
|
23
|
+
rule based parser
|
24
|
+
* Useful set of post-processing methods such as normalize_name
|
25
|
+
|
26
|
+
|
27
|
+
== Installing Scrapes
|
28
|
+
|
29
|
+
gem install scrapes --include-dependencies
|
30
|
+
|
31
|
+
|
32
|
+
== Dependencies
|
33
|
+
|
34
|
+
* Hpricot: http://code.whytheluckystiff.net/hpricot/wiki/AnHpricotShowcase
|
35
|
+
* Rextra: http://rubyforge.org/projects/rextra2/
|
36
|
+
|
37
|
+
|
38
|
+
== Quick Start
|
39
|
+
|
40
|
+
You start by writing a class for parsing a single page:
|
41
|
+
|
42
|
+
# process the Google.com index.html page
|
43
|
+
class GoogleMain < Scrapes::Page
|
44
|
+
# make sure that the :about_link rule matched the web page
|
45
|
+
validates_presence_of(:about_link)
|
46
|
+
|
47
|
+
# extract the link to the about page
|
48
|
+
rule(:about_link, 'a[@href*="about"]', '@href', 1)
|
49
|
+
end
|
50
|
+
|
51
|
+
# process the Google.com about page
|
52
|
+
class GoogleAbout < Scrapes::Page
|
53
|
+
# ensure the :title rule below matches the web page
|
54
|
+
validates_presence_of(:title)
|
55
|
+
|
56
|
+
# extract the text inside the <title></title> tag
|
57
|
+
rule(:title, 'title', 'text()', 1)
|
58
|
+
end
|
59
|
+
|
60
|
+
Then you start a scraping session and use those classes to process the web
|
61
|
+
site:
|
62
|
+
|
63
|
+
Scrapes::Session.start do |session|
|
64
|
+
session.page(GoogleMain, 'http://google.com') do |main_page|
|
65
|
+
session.page(GoogleAbout, main_page.about_link) do |about_page|
|
66
|
+
puts about_page.title + ': ' + session.absolute_uri(main_page.about_link)
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
On my machine, this code produces:
|
72
|
+
About Google: http://www.google.com/intl/en/about.html
|
73
|
+
|
74
|
+
For more information, please review the following classes:
|
75
|
+
* Scrapes::Session
|
76
|
+
* Scrapes::Page
|
77
|
+
* Scrapes::RuleParser
|
78
|
+
* Scrapes::Hpricot::Extractors
|
79
|
+
|
80
|
+
|
81
|
+
== Development Tips
|
82
|
+
|
83
|
+
=== Add something like this to your .irbrc:
|
84
|
+
|
85
|
+
require 'rubygems'
|
86
|
+
require 'yaml'
|
87
|
+
require 'open-uri'
|
88
|
+
require 'hpricot'
|
89
|
+
require 'scrapes'
|
90
|
+
def h(url) Hpricot(open(url)) end
|
91
|
+
Then use like this in irb to understand how Hpricot selectors work:
|
92
|
+
doc = h 'http://www.foobar.com/'
|
93
|
+
links = doc.search('table/a[@href]') # for example
|
94
|
+
To understand the text extractors:
|
95
|
+
texts(links)
|
96
|
+
word(links.first) # etc..
|
97
|
+
|
98
|
+
|
99
|
+
=== Converting normal XPath to Hpricot XPath, sort of:
|
100
|
+
|
101
|
+
There are various add-ons for Firefox, for example, that display the XPath to a
|
102
|
+
selected node. Hpricot uses a different syntax however,
|
103
|
+
(http://code.whytheluckystiff.net/hpricot/wiki/SupportedXpathExpressions). The
|
104
|
+
following method is a first try at the conversion:
|
105
|
+
# Convert a browser-style XPath (e.g. "/html/body/div[2]/p") into an
# Hpricot search expression.
#
# * empty steps and the "html" and "tbody" steps are dropped
# * a 1-based positional predicate "[n]" becomes a 0-based ":eq(n-1)"
# * every other step (including attribute predicates such as
#   [@id="x"]) is passed through unchanged
# * the remaining steps are joined with "//"
#
# Only the number inside the brackets is decremented, so tag names that
# contain digits (h1, h2, ...) are left alone.
def xpath_to_hpricot(path)
  steps = path.split('/').reject { |e| e =~ /^(html|tbody)$/ or e.strip.empty? }
  steps.map do |e|
    e.sub(/\[(\d+)\]/) { ":eq(#{Regexp.last_match(1).to_i - 1})" }
  end.join('//')
end
|
111
|
+
|
112
|
+
=== Hpricot bugs
|
113
|
+
|
114
|
+
* The selector 'a[href="this"]' will hang, but 'a[@href="this"]' won't.
|
115
|
+
Just make sure you have the '@' in front of the attribute name.
|
116
|
+
|
117
|
+
|
118
|
+
== Credits
|
119
|
+
|
120
|
+
* Peter Jones, author and maintainer
|
121
|
+
* Michael Garriss, author and maintainer
|
122
|
+
* Bob Showalter, continuous improvements and maintenance
|
123
|
+
* Assaf Arkin, rule inspiration from http://trac.labnotes.org/cgi-bin/trac.cgi/wiki/Ruby/MicroformatParser
|
data/demo/demo.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
require 'scrapes'
|
26
|
+
|
27
|
+
# Demo: load the Google front page with GoogleMain, follow the link its
# :about_link rule extracted, load that page with GoogleAbout, and print
# the about page's title followed by the absolute URI of the link.
Scrapes::Session.start do |s|
  s.page(GoogleMain, 'http://google.com') do |front|
    s.page(GoogleAbout, front.about_link) do |about|
      label = about.title + ': '
      puts label + s.absolute_uri(front.about_link)
    end
  end
end
|
data/demo/pages/about.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
# Page class used by the demo to process the Google "about" page.
class GoogleAbout < Scrapes::Page
  # The :title rule declared below must match the web page.
  validates_presence_of :title

  # :title takes the text inside the <title></title> tag.
  rule :title, 'title', 'text()', 1
end
|
data/demo/pages/main.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
# Page class used by the demo to process the Google.com index page.
class GoogleMain < Scrapes::Page
  # The :about_link rule declared below must match the web page.
  validates_presence_of :about_link

  # :about_link takes the href of an anchor whose href contains "about".
  rule :about_link, 'a[@href*="about"]', '@href', 1
end
|
data/lib/scrapes.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
require 'rubygems'
|
26
|
+
################################################################################
|
27
|
+
require 'scrapes/initializer'
|
28
|
+
require 'scrapes/to_proxy'
|
29
|
+
require 'scrapes/page'
|
30
|
+
require 'scrapes/cookies'
|
31
|
+
require 'scrapes/cache'
|
32
|
+
require 'scrapes/crawler'
|
33
|
+
require 'scrapes/session'
|
34
|
+
require 'scrapes/cookbook'
|
35
|
+
################################################################################
|
36
|
+
# Make the Hpricot extractor methods callable from the top level.
include Scrapes::Hpricot::Extractors
################################################################################
# Run the library initializer as soon as the library is required.
Scrapes::Initializer.run { |init| init.process }
|
41
|
+
################################################################################
|
@@ -0,0 +1,110 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
require 'ostruct'
|
26
|
+
require 'digest/md5'
|
27
|
+
################################################################################
|
28
|
+
require 'fileutils'
################################################################################
module Scrapes
  ################################################################################
  # Cache web pages on the file system.
  #
  # Each URI is translated to a file name (by default the MD5 hex digest of
  # the URI) inside +directory+.  Caching is off until +enabled+ is set.
  class Cache
    ################################################################################
    # Enable/disable caching
    attr_accessor :enabled

    ################################################################################
    # Set the directory to use for caching
    attr_accessor :directory

    ################################################################################
    # Set to a proc that, given a URI, translates it to a file system name.
    # The default (nil) is to MD5 the URI.
    attr_accessor :uri_translator

    ################################################################################
    # Caching starts out disabled and uses a "cache" directory resolved
    # against the current working directory.
    def initialize
      @directory = File.expand_path('cache')
      @enabled = false
      @uri_translator = nil
    end

    ################################################################################
    # Disables caching while the given block is active.
    #
    # The previous enabled/disabled state is restored even when the block
    # raises.  Returns that previous state.
    def without_cache
      state = @enabled
      @enabled = false
      yield if block_given?
      state
    ensure
      @enabled = state
    end

    ################################################################################
    # Checks the cache to see if there is a match for the given URI.
    #
    # Returns an OpenStruct with +body+ (the cached data) and +cache_file+
    # (the path it was read from), or nil when caching is disabled or no
    # cache file exists for the URI.
    def check(uri)
      return nil unless @enabled

      # FIXME check some time limits around this
      file = File.join(@directory, translate_uri(uri))
      return nil unless File.file?(file)

      OpenStruct.new(:body => File.read(file), :cache_file => file)
    end

    ################################################################################
    # Updates the cache by placing the data for the given URI on the file
    # system.  Does nothing and returns nil when caching is disabled.
    #
    # Missing directories are created, including any that are part of the
    # name produced by +uri_translator+.
    def update(uri, data)
      return nil unless @enabled

      file = File.join(@directory, translate_uri(uri))
      mkdir(File.dirname(file))

      File.open(file, 'w') do |f|
        f << data
      end
    end

    ################################################################################
    # helper method to translate a URL to a MD5
    def uri_to_md5(uri)
      Digest::MD5.hexdigest(uri.to_s)
    end

    ################################################################################
    private

    ################################################################################
    # Create +path+ (the cache directory by default) with all of its parents.
    def mkdir(path = @directory)
      FileUtils.mkdir_p(path)
    end

    ################################################################################
    # File name for +uri+: the custom translator when one is set, MD5 otherwise.
    def translate_uri(uri)
      @uri_translator ? @uri_translator.call(uri) : uri_to_md5(uri)
    end

  end
  ################################################################################
end
|
110
|
+
################################################################################
|
@@ -0,0 +1,53 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
module Scrapes
  ################################################################################
  # Ready-made regular expressions.
  module RE
    EMAIL = /[a-zA-Z]([.]?([[:alnum:]_-]+)*)?@([[:alnum:]\-_]+\.)+[a-zA-Z]{2,4}/
    US_PHONE_NUMBER = /\(?\d{3}[-) ]\s{0,3}\d{3}[- ]\d{4}/
  end
  ################################################################################
  # Turn the parts of a personal name into a single "Last, First ..." string.
  #
  # +names+ is a list of name parts, e.g. %w[John Smith].  Each part is:
  #
  # * run through Unicode.normalize_D (from the 'unicode' library, loaded
  #   on first use) and stripped of every byte >= 128 -- presumably to
  #   drop accents; confirm against that library's documentation
  # * stripped of surrounding whitespace and one trailing '.'
  # * upcased when it is a suffix such as jr, sr, ii, iv, vi
  # * discarded when it is exactly "Rev" or "Ph.D" (any case)
  #
  # When the second to last remaining part ends in a comma
  # ("John", "Smith,", "Jr") that part is moved to the front and the last
  # part stays at the end: "Smith, John JR".  Otherwise the last part gets
  # a comma appended and is moved to the front: "Smith, John".
  #
  # Returns an empty string when no parts remain.
  def self.normalize_name(names)
    require 'unicode'
    parts = names.map do |part|
      ascii = ""
      Unicode.normalize_D(part).each_byte { |byte| ascii << byte if byte < 128 }
      ascii = ascii.strip.chomp('.')
      ascii.upcase! if ascii =~ /^(jr|sr|ii+|iv|vi*)$/i
      # Group the alternation so both anchors apply to both titles; without
      # the group any part starting with "Rev" (e.g. "Revere") was dropped.
      ascii =~ /^(?:Rev|Ph\.D)$/i ? nil : ascii
    end.compact

    return '' if parts.empty?

    if parts[-2] =~ /,$/
      suffix = parts.pop
      parts.unshift(parts.pop)
      parts << suffix
      parts.join(' ')
    else
      parts.unshift(parts.pop + ',').join(' ')
    end
  end
  ################################################################################
end
|
53
|
+
################################################################################
|