wcid 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +4 -0
- data/LICENSE +339 -0
- data/Manifest.txt +18 -0
- data/README.txt +26 -0
- data/Rakefile +123 -0
- data/bin/wcid_ex1.rb +12 -0
- data/examples/wcid_long_ex1.rb +84 -0
- data/lib/wcid.rb +45 -0
- data/lib/wcid/hit.rb +34 -0
- data/lib/wcid/id.rb +80 -0
- data/lib/wcid/marc.rb +69 -0
- data/lib/wcid/search.rb +78 -0
- data/lib/wcid/version.rb +9 -0
- data/scripts/txt2html +67 -0
- data/setup.rb +1585 -0
- data/test/test_helper.rb +2 -0
- data/test/test_wcid.rb +11 -0
- metadata +66 -0
data/bin/wcid_ex1.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
#!/usr/bin/ruby -w
# Prints the LC name authority records for every name heading found in a
# MARC file.
#
# Usage: wcid_ex1.rb path/to/records.mrc
require 'wcid'
include WCID

if ARGV[0].nil?
  warn "Usage: #{File.basename($0)} path/to/records.mrc"
  exit 1
end

begin
  authorities = marc_file_to_auths(ARGV[0])
rescue StandardError => e
  # Report on stderr and exit non-zero so callers and shell pipelines can
  # detect the failure; include the real error because the cause is not
  # always the remote service (e.g. an unreadable MARC file).
  warn 'There has been an error! Check that the WorldCat Identities site is up at http://orlabs.oclc.org/Identities/ before complaining to the developer. The site is often down as it is now only in beta.'
  warn "(#{e.class}: #{e.message})"
  exit 1
end

puts authorities
|
@@ -0,0 +1,84 @@
|
|
1
|
+
#!/usr/bin/ruby -w
## Jason Ronallo
# May 2007
#
# Interactive walk-through: reads MARC records from a file, builds a pnkey
# query for every name field, looks the name up with WCID::Search#search_exact
# and then fetches and prints the matching authority record. The script
# pauses for <enter> after each result.

require 'rubygems'
require "rexml/document"
require "rexml/xpath"
require 'net/http'
require 'uri'
require 'marc'
require 'stringio'
# NOTE(review): the rest of this package loads the library as 'wcid';
# confirm that 'worldcatid' still resolves on the load path.
require 'worldcatid'

# Use the file(s) named on the command line, otherwise ask for a path.
if ARGF.filename == '-'
  print "Enter the full path to a MARC file: "
  file = STDIN.gets.chomp
else
  file = ARGF
end

#wc_search = (WCID::Search.new('Hull, Richard 1945')).search_exact
begin
  records = MARC::Reader.new(file).to_a
rescue MARC::Exception
  puts "MARC Exception"
  exit
rescue StandardError => e
  # StandardError (not Exception) so that Ctrl-C and exit still terminate
  # the script instead of landing back at this prompt.
  puts e
  print "Enter a valid filename: "
  file = STDIN.gets.chomp
  retry
end
puts "records.length: #{records.length}"

records.each do |record|
  fields = record.name_fields
  puts "fields.length #{fields.length}"

  fields.each do |field|
    query = field.pnkey_from_marc
    rec = WCID::Search.new(query).search_exact
    if rec.nil?
      puts "There's no rec for #{query}"
      puts "Hit enter to continue"
      STDIN.gets
      next
    end
    puts "class of rec: #{rec.class}"

    lc_retries = 0
    begin
      auth = rec.get_lc_auth
    rescue Timeout::Error, StandardError => e
      # Timeout::Error stays listed explicitly: on old Rubies it is not a
      # StandardError subclass.
      #puts e; STDIN.gets
      if e.to_s == "eof"
        # get_lc_auth raises "eof" when the service looks down; wait a long
        # time and try again without using up the normal retry budget.
        puts "We'll retry in a long time...."
        sleep 1000
        retry
      end
      if lc_retries < 5
        puts "retrying...."
        lc_retries += 1
        puts
        puts "retry #{lc_retries}"
        retry
      else
        puts "you're outta luck on the lc_auth lookup"
        next
      end
    end

    if auth.nil?
      puts "auth nil"
      STDIN.gets
      next
    end
    puts auth
    puts auth.inspect
    STDIN.gets
  end
end

puts "this is the end of the line"
|
data/lib/wcid.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
#wcid is a ruby library for handling WorldCat Identities
|
2
|
+
#for more information on WorldCat Identities see #http://orlabs.oclc.org/Identities/
|
3
|
+
#
|
4
|
+
#USAGE
|
5
|
+
#
|
6
|
+
# require 'wcid'
|
7
|
+
|
8
|
+
|
9
|
+
require 'rubygems'
|
10
|
+
require 'rexml/document'
|
11
|
+
require 'net/http'
|
12
|
+
require 'uri'
|
13
|
+
require 'marc'
|
14
|
+
require 'stringio'
|
15
|
+
|
16
|
+
require 'wcid/version'
|
17
|
+
|
18
|
+
require 'wcid/marc'
|
19
|
+
require 'wcid/search'
|
20
|
+
require 'wcid/hit'
|
21
|
+
require 'wcid/id'
|
22
|
+
|
23
|
+
module WCID

  # Collects the name authority record for every name heading in a MARC file.
  #
  # filename:: passed straight to MARC::Reader.new
  #
  # For each record, every field returned by +name_fields+ is turned into a
  # pnkey query and looked up with WCID::Search#search_exact; names with no
  # identity, or whose identity yields no authority record, are skipped.
  #
  # Returns an array of whatever +get_lc_auth+ returned for the names that
  # were found.
  def marc_file_to_auths(filename)
    found = []
    MARC::Reader.new(filename).to_a.each do |marc_record|
      marc_record.name_fields.each do |name_field|
        identity = WCID::Search.new(name_field.pnkey_from_marc).search_exact
        next if identity.nil?

        lc_record = identity.get_lc_auth
        found << lc_record unless lc_record.nil?
      end
    end
    found
  end
end
|
data/lib/wcid/hit.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
module WCID

  # Conducting a search of the WorldCat Identities site returns WCID::Hit
  # objects (one per <match> element, see WCID::Search#search).
  #
  # Hit objects contain some potentially useful information, but of primary
  # interest is the lccn, which allows you to grab the authority record of a
  # particular hit.
  class Hit

    attr_reader :score, :established_form, :uri, :citation, :lccn, :pubdates, :name_type

    # m:: the match node; it must respond to +attributes+ and +elements+
    #     (WCID::Search#search passes the nodes it gets from REXML).
    #
    # Every child element is optional; the corresponding reader returns nil
    # when the element is absent.
    def initialize m
      @score            = m.attributes['score']
      @established_form = child_text(m, 'establishedForm')
      @uri              = child_text(m, 'uri')
      @citation         = child_text(m, 'citation')
      @lccn             = child_text(m, 'lccn')
      @pubdates         = child_text(m, 'pubDates')
      @name_type        = child_text(m, 'nameType')
    end

    # Fetches the MarcXML for this hit's lccn over HTTP and returns the first
    # record parsed from the response by MARC::XMLReader (nil if there is
    # none).
    #
    # Returns nil without touching the network when the hit has no lccn,
    # since the URL cannot be built meaningfully in that case.
    def get_lc_auth
      return nil if lccn.nil?

      src = Net::HTTP.get(URI.parse("http://errol.oclc.org/laf/#{lccn}.MarcXML"))
      MARC::XMLReader.new(StringIO.new(src)).to_a[0]
    end

    private

    # Text of the child element +name+ of +node+, or nil if it is missing.
    def child_text(node, name)
      child = node.elements[name]
      child && child.text
    end
  end
end
|
data/lib/wcid/id.rb
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
module WCID

  # A WCID::ID object is a single Identity record in WorldCat Identities,
  # built from the first record of a searchRetrieveResponse document.
  #
  # lccn::       Library of Congress Control Number, in the hyphenated form
  #              produced by #convert_lccn
  # type::       the +type+ attribute of the Identity element
  # wiki_link::  link to Wikipedia for this identity (nil if none is given)
  # subfield_a:: name (nil if absent)
  # subfield_d:: dates associated with the name (nil if absent)
  # pnkey::      each WorldCat Identities record is given a unique pnkey made
  #              up of the name and associated dates (and qualifier?). A pnkey
  #              is the most exact way to search WorldCat Identities.
  class ID

    attr_reader :lccn, :type, :wiki_link, :subfield_a, :subfield_d, :pnkey

    # doc:: the parsed response (WCID::Search#search_exact passes the
    #       REXML::Document returned by do_search).
    #
    # The pnkey and authorityInfo/lccn elements are required; a response
    # without them raises NoMethodError, as before.
    def initialize doc
      @record = doc.elements.to_a("searchRetrieveResponse/records/record")
      @identity = @record[0].elements['recordData/Identity']
      @pnkey = @identity.elements['pnkey'].text
      @lccn_record = @identity.elements['authorityInfo/lccn'].text
      @lccn = convert_lccn(@lccn_record)
      @type = @identity.attributes['type']

      wiki = optional_text('authorityInfo/wikiLink')
      @wiki_link = "http://en.wikipedia.org/wiki/#{wiki}" if @identity.elements['authorityInfo/wikiLink']
      @subfield_a = optional_text('authorityInfo/standardForm/suba')
      @subfield_d = optional_text('authorityInfo/standardForm/subd')
    end

    # Converts the Library of Congress Control Number as represented in the
    # WorldCat Identities database into the hyphenated form used for
    # searching the Linked Authority File.
    #
    # Spaces are removed, a hyphen is inserted in front of the last six
    # characters (the serial part) and the serial's leading zeros are
    # dropped, e.g. "n 79021164" becomes "n79-21164". The argument itself is
    # not modified. Values shorter than six characters are returned with
    # only the spaces removed.
    def convert_lccn(lccn)
      compact = lccn.delete(' ')
      return compact if compact.length < 6

      prefix = compact[0, compact.length - 6]
      # Drop every leading zero but always keep at least one character.
      serial = compact[-6, 6].sub(/\A0+(?=.)/, '')
      "#{prefix}-#{serial}"
    end

    # Returns a MARC::Record representation of the associated LC authority
    # record: the first record MARC::XMLReader parses from the MarcXML
    # fetched for this lccn. Returns nil when there is no lccn to look up.
    #
    # The HTTP request is limited to 20 seconds. Every failure is announced
    # on stdout and then re-raised; an EOFError is re-raised as a
    # RuntimeError with the message "eof" so that callers can recognise a
    # service that appears to be down.
    def get_lc_auth
      return nil if lccn.nil?

      uri = escape_uri("http://errol.oclc.org/laf/#{lccn}.MarcXML")
      src = ''
      Timeout.timeout 20 do
        src = Net::HTTP.get(URI.parse(uri))
      end
      MARC::XMLReader.new(StringIO.new(src)).to_a[0]
    rescue Timeout::Error
      puts "The auth lookup timed out!"
      raise
    rescue MARC::Exception => e
      puts "MARC exception: #{e}"
      raise
    rescue REXML::ParseException
      puts "rexml error"
      raise
    rescue EOFError
      puts "The service seems to be down!"
      raise "eof"
    rescue StandardError => e
      # StandardError rather than Exception: interrupts and exit requests
      # should pass through without being reported as lookup failures.
      puts "some other type of exception"
      puts e
      raise
    end

    private

    # Text of the element at +path+ below the Identity element, or nil when
    # that element is missing.
    def optional_text(path)
      element = @identity.elements[path]
      element && element.text
    end

    # Percent-escapes characters that are not allowed in a URI.
    # URI.escape no longer exists on current Rubies, so the default parser's
    # escape is used when it is available.
    def escape_uri(str)
      if defined?(URI::DEFAULT_PARSER)
        URI::DEFAULT_PARSER.escape(str)
      else
        URI.escape(str)
      end
    end
  end
end
|
data/lib/wcid/marc.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'marc'
|
2
|
+
|
3
|
+
module MARC
  class DataField

    # Builds a WorldCat Identities pnkey query from this name field.
    #
    # The result is the normalised name, then "$" + qualifier (subfield q)
    # when present, then "$" + dates (subfield d) when present, all wrapped
    # in double quotes.
    #
    # This should only really work for exact searches if the datafield had
    # proper authority work done on it.
    def pnkey_from_marc
      name = name_from_datafield
      dates = dates_from_datafield
      qualifier = qualifier_from_datafield

      query = "\"#{name}"
      query += "$" + qualifier if qualifier
      query += dates if dates
      query += "\""
      query
    end

    #Should the following be private?

    # Subfield a, lower-cased, with full stops and apostrophes removed, a
    # trailing comma dropped and hyphens turned into spaces.
    # Raises NoMethodError when the field has no subfield a.
    def name_from_datafield
      self['a'].downcase.delete('.').gsub(/,$/, '').tr('-', ' ').delete("'")
    end

    # Subfield d prefixed with "$", with hyphens turned into spaces and full
    # stops removed; nil when the field has no subfield d.
    # Works on a copy so the record's own subfield value is left untouched.
    def dates_from_datafield
      raw = self['d']
      return nil unless raw

      "$" + raw.tr('-', ' ').delete('.')
    end

    # Subfield q, lower-cased, with commas and parentheses removed and
    # hyphens turned into spaces; nil when the field has no subfield q.
    # Non-destructive string methods are used on purpose: the bang variants
    # return nil when nothing changes, which would drop a qualifier that is
    # already lower case.
    def qualifier_from_datafield
      raw = self['q']
      return nil unless raw

      raw.tr('-', ' ').delete(',()').downcase
    end
  end

  class Record

    # Returns the name fields of this record to be used in queries: the 100
    # field, the 110 field, then every 700 field, skipping whichever are
    # absent.
    def name_fields
      added_entries = find_all { |f| f.tag == '700' }
      ([self['100'], self['110']] + added_entries).compact
    end
  end

end
|
data/lib/wcid/search.rb
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
module WCID

  # WCID::Search objects are for creating and running queries
  # against WorldCat Identities.
  class Search
    attr_reader :query

    # Pass it a string:
    #   name_search = WCID::Search.new('Twain, Mark')
    def initialize query
      @query = query
    end

    # Actually performs a full-name search and returns an array of WCID::Hit
    # objects, one per nameAuthorities/match element of the response (an
    # empty array when there are none).
    #   hits = name_search.search
    # or
    #   hits = (WCID::Search.new('Twain, Mark')).search
    def search
      uri = escape_uri("http://orlabs.oclc.org/Identities/find?fullName=#{@query}")
      doc = do_search(uri)
      doc.elements.to_a("nameAuthorities/match").map { |match| WCID::Hit.new(match) }
    end

    # Runs an exact pnkey search. Returns a single WCID::ID object built
    # from the response, or nil when the response reports zero records.
    #   hit = name_search.search_exact
    # or
    #   hit = (WCID::Search.new('Twain, Mark')).search_exact
    #
    # Raises NoMethodError when the response has no numberOfRecords element;
    # errors from do_search propagate unchanged.
    def search_exact
      uri = escape_uri("http://orlabs.oclc.org/SRW/search/Identities?query=local.pnkey+exact+#{@query}")
      doc = do_search(uri)
      return nil if doc.elements['searchRetrieveResponse/numberOfRecords'].text == '0'

      WCID::ID.new(doc)
    end

    # Fetches +uri+ over HTTP (limited to 20 seconds) and returns the body
    # parsed as a REXML::Document. Timeouts and connection resets are
    # announced on stdout and re-raised.
    #
    # This should maybe be private or somesuch? It is left public so that
    # existing callers keep working.
    def do_search uri
      body = ''
      Timeout.timeout 20 do
        body = Net::HTTP.get(URI.parse(uri.to_s))
      end
      REXML::Document.new body
    rescue Timeout::Error
      puts "timeout error!"
      raise
    rescue Errno::ECONNRESET
      puts "errno!"
      raise
    end

    private

    # Percent-escapes characters that are not allowed in a URI while leaving
    # reserved characters such as "$", "+" and "," alone.
    # URI.escape no longer exists on current Rubies, so the default parser's
    # escape is used when it is available.
    def escape_uri(str)
      if defined?(URI::DEFAULT_PARSER)
        URI::DEFAULT_PARSER.escape(str)
      else
        URI.escape(str)
      end
    end
  end

end
|
data/lib/wcid/version.rb
ADDED
data/scripts/txt2html
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'redcloth'
|
5
|
+
require 'syntax/convertors/html'
|
6
|
+
require 'erb'
|
7
|
+
require File.dirname(__FILE__) + '/../lib/wcid/version.rb'
|
8
|
+
|
9
|
+
# Version string read from the constant loaded via lib/wcid/version.rb.
# NOTE(review): the library code elsewhere in this package uses the module
# name WCID; confirm that version.rb really defines Wcid::VERSION::STRING.
version = Wcid::VERSION::STRING
|
10
|
+
# Project page URL. Like version, this local is visible to the ERB template
# because the template is rendered with this script's binding.
download = 'http://rubyforge.org/projects/wcid'
|
11
|
+
|
12
|
+
# Defined on Integer rather than Fixnum: Fixnum is a subclass of Integer on
# old Rubies and no longer exists on current ones, so this works on both.
class Integer
  # Returns the English ordinal suffix for this number:
  # 'st', 'nd', 'rd' or 'th' (1 -> 'st', 12 -> 'th', 23 -> 'rd').
  def ordinal
    # teens
    return 'th' if (10..19).include?(self % 100)
    # others
    case self % 10
    when 1 then 'st'
    when 2 then 'nd'
    when 3 then 'rd'
    else 'th'
    end
  end
end
|
25
|
+
|
26
|
+
class Time
  # Formats the time as day-of-month with its ordinal suffix, the full month
  # name and the year, separated by single spaces.
  # Relies on the +ordinal+ method this script adds to integers.
  def pretty
    day = mday
    [day.to_s + day.ordinal, strftime('%B'), year].join(' ')
  end
end
|
31
|
+
|
32
|
+
# Converts +source+ to HTML with the Syntax convertor for the language
# +syntax+ and removes a <pre> at the start of a line and a </pre> at the
# end of a line from the result.
def convert_syntax(syntax, source)
  highlighted = Syntax::Convertors::HTML.for_syntax(syntax).convert(source)
  highlighted.gsub(%r!^<pre>|</pre>$!, '')
end
|
35
|
+
|
36
|
+
# Command line: source.txt [template.rhtml]; the rendered page is written to
# stdout.
if ARGV.length >= 1
  src, template = ARGV
  template ||= File.dirname(__FILE__) + '/../website/template.rhtml'

else
  puts("Usage: #{File.split($0).last} source.txt [template.rhtml] > output.html")
  exit!
end

# File.read closes the template file again once its contents are loaded.
template = ERB.new(File.read(template))

# The first line of the source is the page title, the rest is the body.
# title, body, created and modified are local variables on purpose: the
# template is rendered with this script's binding.
title = nil
body = nil
File.open(src) do |fsrc|
  title_text = fsrc.readline
  body_text = fsrc.read
  syntax_items = []
  # Swap every <pre>/<code> element that carries a syntax="..." attribute
  # for a placeholder so RedCloth leaves it alone. The \1 back-reference
  # makes the closing tag match the tag that was opened.
  body_text.gsub!(%r!<(pre|code)[^>]*?syntax=['"]([^'"]+)[^>]*>(.*?)</\1>!m){
    ident = syntax_items.length
    element, syntax, source = $1, $2, $3
    syntax_items << "<#{element} class='syntax'>#{convert_syntax(syntax, source)}</#{element}>"
    "syntax-temp-#{ident}"
  }
  title = RedCloth.new(title_text).to_html.gsub(%r!<.*?>!,'').strip
  body = RedCloth.new(body_text).to_html
  # Put the highlighted snippets back in place of their numbered
  # placeholders (\d+ captures the placeholder index).
  body.gsub!(%r!(?:<pre><code>)?syntax-temp-(\d+)(?:</code></pre>)?!){ syntax_items[$1.to_i] }
end
stat = File.stat(src)
created = stat.ctime
modified = stat.mtime

$stdout << template.result(binding)
|