wcid 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/LICENSE +339 -0
- data/Manifest.txt +18 -0
- data/README.txt +26 -0
- data/Rakefile +123 -0
- data/bin/wcid_ex1.rb +12 -0
- data/examples/wcid_long_ex1.rb +84 -0
- data/lib/wcid.rb +45 -0
- data/lib/wcid/hit.rb +34 -0
- data/lib/wcid/id.rb +80 -0
- data/lib/wcid/marc.rb +69 -0
- data/lib/wcid/search.rb +78 -0
- data/lib/wcid/version.rb +9 -0
- data/scripts/txt2html +67 -0
- data/setup.rb +1585 -0
- data/test/test_helper.rb +2 -0
- data/test/test_wcid.rb +11 -0
- metadata +66 -0
data/bin/wcid_ex1.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
#!/usr/bin/ruby -w
|
2
|
+
require 'wcid'
|
3
|
+
include WCID
|
4
|
+
|
5
|
+
# Print the LC name authority records for every name heading found in the
# MARC file named by the first command-line argument.

# Without a file name there is nothing to look up; say so instead of letting
# the failure be reported as a problem with the remote service.
if ARGV.empty?
  warn "Usage: #{File.basename($0)} marc_file"
  exit 1
end

begin
  authorities = marc_file_to_auths(ARGV[0])
rescue StandardError => e
  # Show the real cause first, then the hint about the remote service, and
  # finish with a non-zero status so callers can detect the failure.
  warn "#{e.class}: #{e.message}"
  warn 'There has been an error! Check that the WorldCat Identities site is up at http://orlabs.oclc.org/Identities/ before complaining to the developer. The site is often down as it is now only in beta.'
  exit 1
end

puts authorities
|
@@ -0,0 +1,84 @@
|
|
1
|
+
#!/usr/bin/ruby -w
|
2
|
+
## Jason Ronallo
|
3
|
+
# May 2007
|
4
|
+
|
5
|
+
require 'rubygems'
|
6
|
+
require "rexml/document"
|
7
|
+
require "rexml/xpath"
|
8
|
+
require 'net/http'
|
9
|
+
require 'uri'
|
10
|
+
require 'marc'
|
11
|
+
require 'stringio'
|
12
|
+
require 'worldcatid'
|
13
|
+
|
14
|
+
#file = ''
|
15
|
+
# Interactive walk-through: read MARC records, search WorldCat Identities for
# each name heading and print the LC authority record that comes back, pausing
# for the user along the way.

# Choose the MARC source: ARGF when file names were given on the command line,
# otherwise a path typed in by the user.
if ARGF.filename == '-'
  print "Enter the full path to a MARC file: "
  file = STDIN.gets.chomp
else
  file = ARGF
end

#wc_search = (WCID::Search.new('Hull, Richard 1945')).search_exact

# Load every record before any lookup starts.
begin
  records = MARC::Reader.new(file).to_a
rescue MARC::Exception
  puts "MARC Exception"
  exit
rescue StandardError => e
  # Any other failure is treated as a bad file name: show it and ask again.
  # StandardError rather than Exception, so Ctrl-C and exit still end the
  # program instead of re-entering this prompt.
  puts e
  print "Enter a valid filename: "
  file = STDIN.gets.chomp
  retry
end
puts "records.length: #{records.length}"

records.each do |record|
  fields = record.name_fields
  puts "fields.length #{fields.length}"

  fields.each do |field|
    query = field.pnkey_from_marc
    rec = WCID::Search.new(query).search_exact
    if rec.nil?
      puts "There's no rec for #{query}"
      puts "Hit enter to continue"
      STDIN.gets
      next
    end
    puts "class of rec: #{rec.class}"

    lc_retries = 0
    begin
      auth = rec.get_lc_auth
    rescue Timeout::Error, StandardError => e
      # WCID::ID#get_lc_auth raises a RuntimeError with the message "eof" when
      # it hits an EOFError; wait a long time and try again without counting
      # the attempt.
      if e.to_s == "eof"
        puts "We'll retry in a long time...."
        sleep 1000
        retry
      end
      # Every other failure gets at most five immediate retries.
      if lc_retries < 5
        puts "retrying...."
        lc_retries += 1
        puts
        puts "retry #{lc_retries}"
        retry
      else
        puts "you're outta luck on the lc_auth lookup"
        next
      end
    end

    if auth.nil?
      puts "auth nil"
      STDIN.gets
      next
    end
    puts auth
    puts auth.inspect
    STDIN.gets
  end
end

puts "this is the end of the line"
|
data/lib/wcid.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
#wcid is a ruby library for handling WorldCat Identities
|
2
|
+
#for more information on WorldCat Identities see http://orlabs.oclc.org/Identities/
|
3
|
+
#
|
4
|
+
#USAGE
|
5
|
+
#
|
6
|
+
# require 'wcid'
|
7
|
+
|
8
|
+
|
9
|
+
require 'rubygems'
|
10
|
+
require 'rexml/document'
|
11
|
+
require 'net/http'
|
12
|
+
require 'uri'
|
13
|
+
require 'marc'
|
14
|
+
require 'stringio'
|
15
|
+
|
16
|
+
require 'wcid/version'
|
17
|
+
|
18
|
+
require 'wcid/marc'
|
19
|
+
require 'wcid/search'
|
20
|
+
require 'wcid/hit'
|
21
|
+
require 'wcid/id'
|
22
|
+
|
23
|
+
module WCID

  # Collects the name authority records associated with the names found in a
  # file of MARC records.
  #
  # filename is passed straight to MARC::Reader.new. All records are read
  # first; then, for every field returned by MARC::Record#name_fields, a pnkey
  # query is built and searched for exactly in WorldCat Identities. Names with
  # no identity, and identities whose authority lookup returns nil, are
  # skipped.
  #
  # Returns an array holding the result of get_lc_auth for each remaining
  # name, in the order the names were encountered.

  def marc_file_to_auths(filename)
    found = []
    MARC::Reader.new(filename).to_a.each do |marc_record|
      marc_record.name_fields.each do |name_field|
        identity = WCID::Search.new(name_field.pnkey_from_marc).search_exact
        next if identity.nil?

        authority = identity.get_lc_auth
        found << authority unless authority.nil?
      end
    end
    found
  end
end
|
data/lib/wcid/hit.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
module WCID

  # One match from a WorldCat Identities name search. WCID::Search#search
  # builds one Hit per match element of the response.

  class Hit

    attr_reader :score, :established_form, :uri, :citation, :lccn, :pubdates, :name_type

    # Hit objects contain some potentially useful information, but of
    # primary interest is the lccn, which allows you to grab the
    # authority record of a particular hit.
    #
    # m is a match element: it must respond to #attributes and #elements the
    # way the element objects passed in by WCID::Search#search do. Every child
    # element is optional; the reader for a missing child returns nil.

    def initialize(m)
      @score            = m.attributes['score']
      @established_form = child_text(m, 'establishedForm')
      @uri              = child_text(m, 'uri')
      @citation         = child_text(m, 'citation')
      @lccn             = child_text(m, 'lccn')
      @pubdates         = child_text(m, 'pubDates')
      @name_type        = child_text(m, 'nameType')
    end

    # Fetches http://errol.oclc.org/laf/<lccn>.MarcXML and returns the first
    # record MARC::XMLReader reads from the response (nil when it reads none).
    #
    # Returns nil without making a request when the hit has no lccn, because
    # the URL cannot be built in that case.
    def get_lc_auth
      return nil if lccn.nil? || lccn.empty?

      src = Net::HTTP.get(URI.parse("http://errol.oclc.org/laf/#{lccn}.MarcXML"))
      MARC::XMLReader.new(StringIO.new(src)).to_a[0]
    end

    private

    # Text of the named child element of parent, or nil when it is absent.
    def child_text(parent, name)
      node = parent.elements[name]
      node.text if node
    end
  end
end
|
data/lib/wcid/id.rb
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
module WCID

  # A WCID::ID object is a single Identity record in WorldCat Identities.
  # WCID::Search#search_exact builds one from the search response.

  class ID
    # lccn = Library of Congress Control Number, converted by convert_lccn
    # type = the type attribute of the Identity element
    # wiki_link = link to Wikipedia for this identity
    # subfield_a = name
    # subfield_d = dates associated with the name
    # pnkey = each WorldCat Identities record is given a unique pnkey made up
    #   of the name and associated dates (and qualifier?). A pnkey is the most
    #   exact way to search WorldCat Identities.

    attr_reader :lccn, :type, :wiki_link, :subfield_a, :subfield_d, :pnkey

    # doc is the parsed search response; only the first
    # searchRetrieveResponse/records/record element is used. All
    # authorityInfo children are optional, including the lccn.
    def initialize(doc)
      @record = doc.elements.to_a("searchRetrieveResponse/records/record")
      @identity = @record[0].elements['recordData/Identity']
      @pnkey = @identity.elements['pnkey'].text
      @lccn_record = identity_text('authorityInfo/lccn')
      @lccn = convert_lccn(@lccn_record) if @lccn_record
      @type = @identity.attributes['type']
      wiki = @identity.elements['authorityInfo/wikiLink']
      @wiki_link = "http://en.wikipedia.org/wiki/#{wiki.text}" if wiki
      @subfield_a = identity_text('authorityInfo/standardForm/suba')
      @subfield_d = identity_text('authorityInfo/standardForm/subd')
    end

    # A method to convert the Library of Congress Control Number as represented
    # in the WorldCat Identities database into the hyphenated form used for
    # searching the Linked Authority File.
    #
    # Spaces are removed, then a hyphen is inserted before the last six
    # characters and up to three leading zeros of those six are dropped
    # ("n  79021164" => "n79-21164"). The argument itself is left untouched.
    #
    # NOTE(review): when the first four of the last six characters are all
    # "0", no hyphen is inserted and the space-stripped string is returned
    # as-is. Confirm whether that is what the lookup service expects.

    def convert_lccn(lccn)
      compact = lccn.gsub(' ', '')
      len = compact.length
      # Distance from the end of the first non-zero character among the four
      # leading positions of the six-character tail.
      back = [6, 5, 4, 3].find { |offset| compact[len - offset, 1] != '0' }
      return compact unless back

      compact[0, len - 6] + "-" + compact[len - back, 99]
    end

    # Returns a MARC::Record representation of the associated LC authority
    # record (the first record read from the MarcXML response), or nil when
    # this identity has no lccn.
    #
    # The HTTP request is limited to 20 seconds. Failures are reported on
    # stdout and re-raised; an EOFError is re-raised as a RuntimeError with the
    # message "eof".

    def get_lc_auth
      return nil if lccn.nil?

      uri = escape_uri("http://errol.oclc.org/laf/#{lccn}.MarcXML")
      src = ''
      Timeout.timeout(20) do
        src = Net::HTTP.get(URI.parse(uri))
      end
      MARC::XMLReader.new(StringIO.new(src)).to_a[0]
    rescue Timeout::Error
      puts "The auth lookup timed out!"
      raise
    rescue MARC::Exception => e
      puts "MARC exception: #{e}"
      raise
    rescue REXML::ParseException
      puts "rexml error"
      raise
    rescue EOFError
      puts "The service seems to be down!"
      raise "eof"
    rescue StandardError => e
      puts "some other type of exception"
      puts e
      raise
    end

    private

    # Text of the element at path below the Identity element, or nil when the
    # element is absent.
    def identity_text(path)
      node = @identity.elements[path]
      node.text if node
    end

    # Percent-escapes characters that are not allowed in a URI. URI.escape no
    # longer exists on current Ruby, so the parser object is used whenever it
    # is available.
    def escape_uri(text)
      if defined?(URI::DEFAULT_PARSER)
        URI::DEFAULT_PARSER.escape(text)
      else
        URI.escape(text)
      end
    end
  end

end
|
data/lib/wcid/marc.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'marc'
|
2
|
+
|
3
|
+
module MARC
  class DataField

    # Accepts a MARC datafield and creates a pnkey for use as a query; really
    # just a bunch of string substitutions that turn the name, qualifier and
    # dates subfields into the pnkey form.
    # This should only really work for exact searches if the datafield passed
    # to the method had proper authority work done on it.
    #
    # Returns the key wrapped in double quotes: the name, then "$" + qualifier
    # when subfield q is present, then "$" + dates when subfield d is present.

    def pnkey_from_marc
      name = name_from_datafield
      dates = dates_from_datafield
      qualifier = qualifier_from_datafield

      query = "\"#{name}"
      query += "$" + qualifier if qualifier
      query += dates if dates
      query + "\""
    end

    #Should the following be private?

    # Subfield a, lowercased, with full stops removed, a comma at the end of a
    # line removed, hyphens turned into spaces and apostrophes removed (in
    # that order). Assumes subfield a is present.
    def name_from_datafield
      self['a'].downcase.gsub('.', '').gsub(/,$/, '').gsub('-', ' ').gsub('\'', '')
    end

    # "$" followed by subfield d with hyphens turned into spaces and full
    # stops removed, or nil when there is no subfield d. Works on a copy, so
    # the field's own subfield value is not modified.
    def dates_from_datafield
      dates = self['d']
      return nil unless dates

      "$" + dates.gsub('-', ' ').gsub('.', '')
    end

    # Subfield q, lowercased, with hyphens turned into spaces and commas and
    # parentheses removed, or nil when there is no subfield q. Works on a
    # copy and always returns the cleaned string; in-place downcase! would
    # return nil for a qualifier that is already lowercase.
    def qualifier_from_datafield
      qualifier = self['q']
      return nil unless qualifier

      qualifier.gsub('-', ' ').delete(',()').downcase
    end
  end

  class Record

    # Returns an array of the name fields to be used in queries: the 100
    # field and the 110 field (when present) followed by every 700 field.

    def name_fields
      main_entries = [self['100'], self['110']].compact
      added_entries = find_all { |field| field.tag == '700' }
      main_entries + added_entries
    end
  end

end
|
data/lib/wcid/search.rb
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
module WCID

  # WCID::Search objects are for creating and running queries
  # against WorldCat Identities.

  class Search
    attr_reader :query

    # pass it a string
    #   name_search = WCID::Search.new('Twain, Mark')

    def initialize(query)
      @query = query
    end

    # Actually performs the search; returns an array of Hit objects, one per
    # nameAuthorities/match element of the response (empty when there are
    # none).
    #   hits = name_search.search
    # or
    #   hits = (WCID::Search.new('Twain, Mark')).search

    def search
      doc = do_search(escape_uri("http://orlabs.oclc.org/Identities/find?fullName=#{@query}"))
      doc.elements.to_a("nameAuthorities/match").map { |match| WCID::Hit.new(match) }
    end

    # Returns a single ID object (built from the response, whose first record
    # is used) or nil when the response reports zero records.
    #   hit = name_search.search_exact
    # or
    #   hit = (WCID::Search.new('Twain, Mark')).search_exact

    def search_exact
      doc = do_search(escape_uri("http://orlabs.oclc.org/SRW/search/Identities?query=local.pnkey+exact+#{@query}"))
      return nil if doc.elements['searchRetrieveResponse/numberOfRecords'].text == '0'

      WCID::ID.new(doc)
    end

    # Fetches uri (limited to 20 seconds) and returns the response parsed as a
    # REXML::Document. Timeouts and connection resets are reported on stdout
    # and re-raised.
    #
    # This should maybe be private or somesuch? Left public so existing
    # callers keep working.
    def do_search(uri)
      file = ''
      Timeout.timeout(20) do
        file = Net::HTTP.get(URI.parse("#{uri}"))
      end
      REXML::Document.new file
    rescue Timeout::Error
      puts "timeout error!"
      raise
    rescue Errno::ECONNRESET
      puts "errno!"
      raise
    end

    private

    # Percent-escapes characters that are not allowed in a URI. URI.escape no
    # longer exists on current Ruby, so the parser object is used whenever it
    # is available.
    def escape_uri(text)
      if defined?(URI::DEFAULT_PARSER)
        URI::DEFAULT_PARSER.escape(text)
      else
        URI.escape(text)
      end
    end
  end

end
|
data/lib/wcid/version.rb
ADDED
data/scripts/txt2html
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'redcloth'
|
5
|
+
require 'syntax/convertors/html'
|
6
|
+
require 'erb'
|
7
|
+
require File.dirname(__FILE__) + '/../lib/wcid/version.rb'
|
8
|
+
|
9
|
+
# Top-level locals; they are part of the binding handed to the ERB template
# at the end of this script.
# NOTE(review): the library files in this package use the namespace WCID;
# confirm that lib/wcid/version.rb really defines Wcid::VERSION::STRING.
version = Wcid::VERSION::STRING
|
10
|
+
download = 'http://rubyforge.org/projects/wcid'
|
11
|
+
|
12
|
+
# Adds English ordinal suffixes to integers. Defined on Integer rather than
# Fixnum so that it also works on Ruby versions where Fixnum no longer exists.
class Integer
  # Returns the English ordinal suffix for this number:
  # 'st', 'nd', 'rd' or 'th' (1 => 'st', 12 => 'th', 23 => 'rd').
  def ordinal
    # teens (and 10) always take 'th': 11th, 12th, 13th, 111th
    return 'th' if (10..19).include?(self % 100)

    # others depend on the last digit only
    case self % 10
    when 1 then 'st'
    when 2 then 'nd'
    when 3 then 'rd'
    else 'th'
    end
  end
end
|
25
|
+
|
26
|
+
class Time
  # Formats the time as day of month with its ordinal suffix, full month name
  # and year, separated by single spaces. Relies on the ordinal method this
  # script adds to integers.
  def pretty
    day = mday
    [day.to_s + day.ordinal, strftime('%B'), year].join(' ')
  end
end
|
31
|
+
|
32
|
+
# Converts source to HTML with the Syntax convertor for the given syntax name,
# then removes any <pre> found at the start of a line and any </pre> found at
# the end of a line from the result.
def convert_syntax(syntax, source)
  convertor = Syntax::Convertors::HTML.for_syntax(syntax)
  html = convertor.convert(source)
  html.gsub(%r!^<pre>|</pre>$!, '')
end
|
35
|
+
|
36
|
+
# Command line: source.txt [template.rhtml]. The template defaults to
# ../website/template.rhtml relative to this script.
if ARGV.length >= 1
  src, template = ARGV
  template ||= File.dirname(__FILE__) + '/../website/template.rhtml'
else
  puts("Usage: #{File.split($0).last} source.txt [template.rhtml] > output.html")
  exit!
end

# File.read closes the template file once it has been read.
template = ERB.new(File.read(template))

# title, body, created and modified are locals so that they are part of the
# binding given to the template below; do not rename them without checking
# the template.
title = nil
body = nil
File.open(src) do |fsrc|
  # The first line of the source is the title, the rest is the body.
  title_text = fsrc.readline
  body_text = fsrc.read
  syntax_items = []
  # Pull every <pre>/<code> element carrying a syntax="..." attribute out of
  # the text before RedCloth sees it, leaving a numbered placeholder behind.
  # \1 makes the closing tag match the opening one.
  body_text.gsub!(%r!<(pre|code)[^>]*?syntax=['"]([^'"]+)[^>]*>(.*?)</\1>!m){
    ident = syntax_items.length
    element, syntax, source = $1, $2, $3
    syntax_items << "<#{element} class='syntax'>#{convert_syntax(syntax, source)}</#{element}>"
    "syntax-temp-#{ident}"
  }
  title = RedCloth.new(title_text).to_html.gsub(%r!<.*?>!,'').strip
  body = RedCloth.new(body_text).to_html
  # Put the highlighted snippets back in place of their numbered
  # placeholders, dropping a <pre><code> wrapper around a placeholder if
  # there is one.
  body.gsub!(%r!(?:<pre><code>)?syntax-temp-(\d+)(?:</code></pre>)?!){ syntax_items[$1.to_i] }
end
stat = File.stat(src)
created = stat.ctime
modified = stat.mtime

$stdout << template.result(binding)
|