triple_parser 0.1.9
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +43 -0
- data/lib/main.rb +16 -0
- data/lib/triple_parser/bracketed_url_splitter.rb +183 -0
- data/lib/triple_parser/colon_separated_splitter.rb +95 -0
- data/lib/triple_parser/regional_text_splitter.rb +41 -0
- data/lib/triple_parser/settings.rb +12 -0
- data/lib/triple_parser/splitter.rb +52 -0
- data/lib/triple_parser/t_maker.rb +63 -0
- data/lib/triple_parser/third.rb +17 -0
- data/lib/triple_parser/to_rdf.rb +133 -0
- data/lib/triple_parser/triple_set.rb +123 -0
- data/lib/triple_parser/unspecified_splitter.rb +27 -0
- data/lib/triple_parser/variable_splitter.rb +30 -0
- data/lib/triple_parser.rb +46 -0
- metadata +58 -0
data/README.md
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
Triple parser
|
2
|
+
=============
|
3
|
+
|
4
|
+
Converts text containing RDF triples between simple and complex versions
|
5
|
+
|
6
|
+
For example:
|
7
|
+
|
8
|
+
triples = <<EOF
|
9
|
+
id:112121212111111111111 owl:event:time id:234242342342334234242432
|
10
|
+
id:234242342342334234242432 rdf:type owl:timeline:Interval
|
11
|
+
id:234242342342334234242432 owl:timeline:beginsAtDateTime xml:date_time:'2010-02-15T12:00:00Z'
|
12
|
+
id:234242342342334234242432 owl:timeline:endsAtDateTime xml:date_time:'2010-02-17T12:00:00Z'
|
13
|
+
EOF
|
14
|
+
|
15
|
+
TripleParser.to_rdf(triples)
|
16
|
+
|
17
|
+
Outputs:
|
18
|
+
|
19
|
+
[
|
20
|
+
"<http://en.wikipedia.org/wiki/Triplestore/things/112121212111111111111#id> <http://purl.org/NET/c4dm/event.owl#time> <http://en.wikipedia.org/wiki/Triplestore/things/234242342342334234242432#id> .",
|
21
|
+
"<http://en.wikipedia.org/wiki/Triplestore/things/234242342342334234242432#id> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/NET/c4dm/timeline.owl#Interval> .",
|
22
|
+
"<http://en.wikipedia.org/wiki/Triplestore/things/234242342342334234242432#id> <http://purl.org/NET/c4dm/timeline.owl#beginsAtDateTime> "2010-02-15T12:00:00Z"^^<http://www.w3.org/2001/XMLSchema#dateTime> .",
|
23
|
+
"<http://en.wikipedia.org/wiki/Triplestore/things/234242342342334234242432#id> <http://purl.org/NET/c4dm/timeline.owl#endsAtDateTime> "2010-02-17T12:00:00Z"^^<http://www.w3.org/2001/XMLSchema#dateTime> ."
|
24
|
+
]
|
25
|
+
|
26
|
+
Setting site specific application url
|
27
|
+
-------------------------------------
|
28
|
+
The default application url is 'en.wikipedia.org/wiki/Triplestore' where you can read more about triplestores.
|
29
|
+
To change this to your site specific url, use this (in a Rails initializer for example):
|
30
|
+
|
31
|
+
TripleParser::Settings.application_domain = 'undervale.co.uk'
|
32
|
+
|
33
|
+
Playground
|
34
|
+
----------
|
35
|
+
A simple Sinatra site is included, where you can enter triples and see how they are converted by TripleParser.to_rdf
|
36
|
+
|
37
|
+
To play:
|
38
|
+
|
39
|
+
ruby web.rb
|
40
|
+
|
41
|
+
The page can then be viewed at http://localhost:4567
|
42
|
+
|
43
|
+
Enter your triples in the text area and click submit. The output will appear below the text area.
|
data/lib/main.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
require_relative 'triple_parser'
|
2
|
+
|
3
|
+
# Demo input: three colon-separated triples followed by one triple written
# with bracketed URLs.
text = <<EOF
id:9108fe02-0bbb-4ed9-890f-b454877ce12c type_is Event
id:9108fe02-0bbb-4ed9-890f-b454877ce12c name_is string:'Troops tighten grip on Taliban stronghold'
id:9108fe02-0bbb-4ed9-890f-b454877ce12c has_time id:0237eb08-e4a5-463c-baaa-5a28f2b63707
<http://www.undervale.co.uk/things/9108fe02-0bbb-4ed9-890f-b454877ce12c#id> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/NET/c4dm/event.owl#Event>.
EOF

TripleParser.input(text)

# Inspect the parsed value of the object (third element) of every triple.
TripleParser.triples.each { |triple| p triple.object.value }
|
@@ -0,0 +1,183 @@
|
|
1
|
+
module TripleParser

  require_relative 'splitter'

  # Splitter for thirds written as a bracketed URL, for example
  # <http://purl.org/NET/c4dm/event.owl#time>.
  #
  # The URL is tested against a series of patterns, in a fixed order, and the
  # first one that matches decides the :type and :value placed in the parts
  # hash. The extracted URL is always added under :url.
  class BracketedUrlSplitter < Splitter

    # Truthy (the match position) when the string contains "<http://" followed
    # by non-whitespace characters and a closing ">".
    def self.can_split?(string)
      bracketed_url_pattern =~ string
    end

    def rdf_style
      'bracketed_url'
    end

    private

    # NOTE: a bare `private` does not apply to `def self.` methods, so this
    # class method is still callable from outside the class.
    def self.bracketed_url_pattern
      /\<http\:\/\/[\S]+\>/
    end

    # Type/value pair for the URL, plus the URL itself.
    def get_parts
      type_and_value = type_value_from_bracketed_url
      type_and_value.merge(:url => get_url)
    end

    # The http URL up to (not including) any '#' fragment.
    # Raises NoMethodError when the text contains no URL of the expected shape.
    def get_url
      /http\:\/\/[\w\-\.]+\.[a-zA-Z]{2,7}(?:\/[\w\-\._]+)*/.match(self)[0]
    end

    # Picks the type/value builder for the first pattern that matches.
    # The order of the checks is significant.
    def type_value_from_bracketed_url
      return type_value_from_text_after_hash_url if text_after_hash_pattern =~ self
      return type_value_for_resource if resource_url_pattern =~ self
      return type_value_for_ontology if ontology_url_pattern =~ self
      return type_value_for_dc_terms if dc_terms_pattern =~ self
      return type_value_for_asset if asset_pattern =~ self

      type_value_for_unknown_url
    end

    # A '#' followed by letters; the letters are captured.
    def text_after_hash_pattern
      /\#([a-zA-Z]+)/
    end

    # Letters following the first '#', or nil when there is no such fragment.
    def after_hash
      fragment = text_after_hash_pattern.match(self)
      @after_hash ||= fragment[1] if fragment
    end

    # Handles URLs that carry a '#fragment'. Returns an empty hash when the
    # fragment style is not one of the recognised kinds.
    def type_value_from_text_after_hash_url
      return type_value_for_id_after_hash if after_hash == 'id'
      return type_value_for_xml_schema if xml_data_pattern =~ self
      return type_value_for_rdf if rdf_url_pattern =~ self
      return type_value_for_owl if owl_pattern =~ self

      {}
    end

    # Quoted data followed by ^^<http..., e.g. "2010-02-15T12:00:00Z"^^<http://...
    def xml_data_pattern
      /["'](.+)["']\^{2}\<http/
    end

    # Type is "xml:" plus the underscored fragment; value is the quoted data.
    def type_value_for_xml_schema
      {:type => "xml:#{underscore(after_hash)}", :value => match(xml_data_pattern)[1]}
    end

    def owl_pattern
      /\.owl#/
    end

    # Type is "owl:" plus the text before the '#' with ".owl" removed;
    # value is the fragment after the '#'.
    def type_value_for_owl
      before_hash = match(text_before_hash_pattern)[1]
      {:type => "owl:#{before_hash.gsub(/\.owl/, "")}", :value => after_hash}
    end

    def rdf_url_pattern
      /rdf\-syntax\-ns/
    end

    def type_value_for_rdf
      {:type => 'rdf', :value => after_hash}
    end

    # For "...#id" URLs the value is the text immediately before the '#'.
    def type_value_for_id_after_hash
      {:type => 'id', :value => match(text_before_hash_pattern)[1]}
    end

    # Captures the run of word, '-', '.' and '_' characters directly before a '#'.
    def text_before_hash_pattern
      /([\w\-\._]*)\#/
    end

    # Type is the resource identifier found in the URL; value is its last path element.
    def type_value_for_resource
      {
        :type => match(resource_url_pattern)[1],
        :value => match(last_element_of_url_pattern)[1]
      }
    end

    def ontology_url_pattern
      /data\.press\.net\/ontology\/(?:\w+\/)+(\w+)/
    end

    def type_value_for_ontology
      {:type => 'ontology', :value => match(ontology_url_pattern)[1]}
    end

    def dc_terms_pattern
      /\/dc\/terms\/([a-zA-Z_]+)/
    end

    def type_value_for_dc_terms
      {:type => 'dc:terms', :value => match(dc_terms_pattern)[1]}
    end

    def asset_pattern
      /\/ontologies\/asset\/([a-zA-Z_]+)/
    end

    def type_value_for_asset
      {:type => 'asset', :value => match(asset_pattern)[1]}
    end

    # No type or value can be determined; only the URL is reported.
    def type_value_for_unknown_url
      {:url => get_url}
    end

    # Letters/underscores between the final '/' and the closing '>'.
    def last_element_of_url_pattern
      /\/([a-zA-Z_]+)\>/
    end

    # One of the resource identifiers followed by "/name>"; the identifier is captured.
    def resource_url_pattern
      /(#{resource_identifiers.join('|')})\/[a-zA-Z_]+\>/
    end

  end

end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
module TripleParser

  require_relative 'splitter'

  # Splitter for the short-hand colon notation, for example:
  #   id:1234            (type:value)
  #   owl:event:time     (double-colon entry)
  #   name(?a, ?b)       (function with arguments)
  class ColonSeparatedSplitter < Splitter

    # Truthy (the match position) when the string contains a colon separated pair.
    def self.can_split?(string)
      colon_separated_rdf_pattern =~ string
    end

    def rdf_style
      'colon_separated'
    end

    private

    # NOTE: a bare `private` does not apply to `def self.` methods, so this
    # class method is still callable from outside the class.
    def self.colon_separated_rdf_pattern
      type_and_value = '[a-z\-]+:[\w-]+'                # this:example
      quoted_or_word_suffix = %q{:(['"].+['"]|\w+)}     # :'this example'
      bracketed_arguments = '\(.+\)'                    # (this, example)

      # this:example or this:example:'with string' or this:function(example)
      Regexp.new("#{type_and_value}(#{quoted_or_word_suffix}|#{bracketed_arguments})?")
    end

    # Chooses how to split the text; the order of the checks is significant.
    # Raises when the text contains no colon and is neither a function nor an
    # ontology keyword.
    def get_parts
      return get_parts_for_function if function_pattern =~ self
      return get_ontology if ontologies.include?(self)
      return get_double_colon_entry if double_colon_first_elements.include?(before_colon)
      return get_parts_for_type_value_pair if include?(':')

      raise "Unable to get parts from '#{self}'"
    end

    def function_pattern
      /^[\w_:]+\(.+\)/
    end

    # Everything before the first '(' is the function name. The text between
    # the first '(' and the first ')' is split on commas/whitespace and each
    # piece is brewed into a third, stored in @arguments.
    def get_parts_for_function
      open_at = index('(')
      close_at = index(')')
      raw_arguments = self[(open_at + 1)..(close_at - 1)].split(/[\,\s]+/)
      @arguments = raw_arguments.collect { |argument| TMaker.brew(argument) }
      {:type => 'function', :value => self[0, open_at]}
    end

    def get_parts_for_type_value_pair
      {:type => before_colon, :value => after_colon}
    end

    # Text before the first colon; nil when there is no colon.
    def before_colon
      return unless include?(':')
      @before_colon ||= self[0, index(':')]
    end

    # Text after the first colon; nil when there is no colon.
    def after_colon
      return unless include?(':')
      @after_colon ||= self[(index(':') + 1)..length]
    end

    # Strips one leading and one trailing quote character, but only when the
    # text is wrapped in a matching pair of single or double quotes.
    def remove_bracketing_quotes(text)
      return text unless /^'.+'$/ =~ text || /^".+"$/ =~ text

      text.gsub(/^['"]/, "").gsub(/['"]$/, "")
    end

    # Prefixes whose type is made of the first two colon separated elements.
    def double_colon_first_elements
      %w{xml owl dc text}
    end

    # Type is "first:second"; value is everything after the second colon,
    # rejoined with ':' and with bracketing quotes removed.
    def get_double_colon_entry
      pieces = split(':')
      remainder = pieces[2..pieces.length].join(':')
      {
        :type => "#{pieces[0]}:#{pieces[1]}",
        :value => remove_bracketing_quotes(remainder)
      }
    end

    def get_ontology
      {:type => 'ontology', :value => self}
    end

  end

end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
module TripleParser

  # Splitter for quoted text followed by a language/region tag,
  # for example 'Hello'@en or "Bonjour"@fr.
  class RegionalTextSplitter < Splitter

    # Truthy (the match position) when the string contains quoted text
    # followed by '@' and a tag made of word characters or hyphens.
    def self.can_split?(string)
      quoted_text_ampersand_and_language_identifier =~ string
    end

    def rdf_style
      'regional_text'
    end

    private

    # NOTE: a bare `private` does not apply to `def self.` methods, so this
    # class method is still callable from outside the class.
    def self.quoted_text_ampersand_and_language_identifier
      /('.*'|".*")\@[\w\-_]+/
    end

    # Type is "text:<region>"; value is the text with quote characters removed.
    def get_parts
      {
        :type => "text:#{region}",
        :value => text
      }
    end

    def region
      after_ampersand
    end

    def text
      before_ampersand.gsub(/['"]/, "")
    end

    # Everything after the LAST '@'. The tag always follows the final '@',
    # so an '@' inside the quoted text (e.g. an e-mail address) is left alone.
    def after_ampersand
      rpartition('@').last
    end

    # Everything before the LAST '@'. Splitting on the first '@' would
    # truncate quoted text that itself contains an '@'.
    def before_ampersand
      rpartition('@').first
    end
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
|
2
|
+
module TripleParser
  # Abstract base class for the splitters. A splitter is the text of one
  # third of a triple (it is a String) that knows how to break itself into a
  # parts hash with :type, :value and :url entries.
  #
  # Subclasses must provide:
  #   self.can_split?(string) - whether the class can handle the string
  #   rdf_style               - a string naming the notation handled
  #   get_parts (private)     - builds the parts hash
  class Splitter < String

    # Readers for parts, rdf_style and url are defined explicitly below, so
    # only writers are generated for them (avoids redefining generated readers).
    attr_accessor :arguments
    attr_writer :parts, :rdf_style, :url

    def self.can_split?(string)
      raise "Need to define test to determine if string can be converted using this class"
    end

    # The parts hash, built on first use by the subclass's get_parts.
    def parts
      @parts ||= get_parts
    end

    def type
      parts[:type]
    end

    def value
      parts[:value]
    end

    def url
      parts[:url]
    end

    def rdf_style
      raise "Need to define string that identifies rdf stype"
    end

    private

    def get_parts
      raise "Need to define how to split text input into seperate parts"
    end

    # Converts camelCase to snake_case, e.g. "dateTime" -> "date_time".
    # Returns a new string; the argument is left untouched, so memoised or
    # frozen strings can be passed in safely.
    def underscore(text)
      text.gsub(/([a-z])([A-Z])/, '\1_\2').downcase
    end

    # URL path elements that identify a resource style URL.
    def resource_identifiers
      %w{resource domain}
    end

    # Bare words that are treated as ontology entries.
    def ontologies
      %w{about mentions}
    end

  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
module TripleParser
  # Builds a Third from a piece of text by finding the first splitter class
  # able to handle the text and copying the splitter's findings across.
  class TMaker
    require_relative 'third'
    require_relative 'bracketed_url_splitter'
    require_relative 'colon_separated_splitter'
    require_relative 'variable_splitter'
    require_relative 'unspecified_splitter'
    require_relative 'regional_text_splitter'

    attr_accessor :arguments, :rdf_style, :url

    # Convenience wrapper: TMaker.brew(text) returns the Third for text.
    def self.brew(*args)
      new(*args).third
    end

    # Only the first argument (the text to parse) is used.
    def initialize(*args)
      @string = args.first
    end

    # The Third for the text. Any StandardError raised while splitting is
    # swallowed and a Third with rdf_style 'unknown' is returned instead.
    def third
      splitter = split_text
      # :type is read first on purpose: reading it builds the splitter's
      # parts, which is what populates its arguments for function text.
      Third.new(
        @string,
        :type => splitter.type,
        :value => splitter.value,
        :url => splitter.url,
        :rdf_style => splitter.rdf_style,
        :arguments => splitter.arguments
      )
    rescue
      Third.new(@string, :rdf_style => 'unknown')
    end

    # The splitter instance for the text (memoised).
    def split_text
      @split_text ||= get_split_text
    end

    private

    # Instantiates the first splitter class, in priority order, that reports
    # it can split the text. Raises when none can.
    def get_split_text
      splitter_class = splitters.find { |candidate| candidate.can_split?(@string) }
      raise "Unable to get parts for third from: '#{@string}'" unless splitter_class

      splitter_class.new(@string)
    end

    # Splitter classes in the order they are tried.
    def splitters
      [
        BracketedUrlSplitter,
        ColonSeparatedSplitter,
        VariableSplitter,
        RegionalTextSplitter,
        UnspecifiedSplitter
      ]
    end

  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module TripleParser

  # One element of a triple. It is the original text (a String) decorated
  # with the details extracted from it: type, value, arguments, rdf_style
  # and url.
  class Third < String

    attr_accessor :type, :value, :arguments, :rdf_style, :url

    # string - the original text of this element
    # args   - optional hash with :type, :value, :arguments, :rdf_style and
    #          :url entries; missing entries leave the attribute nil
    def initialize(string, args = {})
      @type, @value, @arguments, @rdf_style, @url =
        args.values_at(:type, :value, :arguments, :rdf_style, :url)
      super(string)
    end

  end
end
|
@@ -0,0 +1,133 @@
|
|
1
|
+
|
2
|
+
module TripleParser
  # Renders a third (an object responding to type, value and arguments) as
  # RDF text. The third's type selects one of the handler methods below.
  class ToRdf

    # The only method names a third's type may dispatch to. Types are taken
    # from parsed input, so dispatch must not reach arbitrary methods such as
    # to_s (unbounded recursion) or display.
    TYPE_HANDLERS = %w[owl xml dc text id domain resource ontology asset function var rdf].freeze

    def initialize(third)
      @third = third
    end

    # The RDF text for the third.
    def to_s
      get_output.to_s
    end

    protected

    # Thirds without a type are passed through unchanged.
    def get_output
      third_type ? pass_to_type_method : unknown
    end

    def third_type
      @third_type ||= @third.type
    end

    # Dispatches on the type: "prefix:suffix" types go to the handler named
    # by the prefix, plain types go to the handler of the same name, and
    # anything else falls back to "type:value".
    def pass_to_type_method
      if type_if_known_colon_pair?
        send(colon_prefix)
      elsif method_exists_for?(third_type)
        send(third_type)
      else
        unknown_type
      end
    end

    def type_if_known_colon_pair?
      method_exists_for?(colon_prefix) if colon_pair_pattern =~ third_type
    end

    # True only for the whitelisted handler names.
    def method_exists_for?(name)
      TYPE_HANDLERS.include?(name.to_s)
    end

    # prefix:suffix
    def colon_pair_pattern
      /([a-zA-Z\-\_]+)\:([a-zA-Z\-\_]+)/
    end

    def colon_match
      @colon_match ||= colon_pair_pattern.match(third_type)
    end

    def colon_prefix
      @colon_prefix ||= colon_match[1]
    end

    def colon_suffix
      @colon_suffix ||= colon_match[2]
    end

    def owl
      "<http://purl.org/NET/c4dm/#{colon_suffix}.owl##{@third.value}>"
    end

    # Quoted value followed by the XML Schema datatype named by the suffix.
    def xml
      %Q{"#{@third.value}"^^<http://www.w3.org/2001/XMLSchema##{camelcase(colon_suffix)}>}
    end

    def dc
      "<http://purl.org/dc/#{colon_suffix}/#{@third.value}>"
    end

    # Quoted value followed by '@' and the suffix.
    def text
      %Q{"#{@third.value}"@#{colon_suffix}}
    end

    def unknown
      @third
    end

    def unknown_type
      "#{third_type}:#{@third.value}"
    end

    def id
      "<http://#{Settings.application_domain}/things/#{@third.value}#id>"
    end

    # Always the same URL; the third's value is not used.
    def domain
      "<http://#{Settings.application_domain}/ontologies/domain/name>"
    end

    def resource
      "<http://dbpedia.org/resource/#{@third.value}>"
    end

    def ontology
      "<http://data.press.net/ontology/tag/#{@third.value}>"
    end

    def asset
      "<http://#{Settings.application_domain}/ontologies/asset/#{@third.value}>"
    end

    # name(arg arg ...) with each argument rendered through this class.
    def function
      arguments = @third.arguments.collect { |argument| self.class.new(argument) }
      "#{@third.value}(#{arguments.join(' ')})"
    end

    def var
      "?#{@third.value}"
    end

    def rdf
      "<http://www.w3.org/1999/02/22-rdf-syntax-ns##{@third.value}>"
    end

    # Converts snake_case to camelCase, e.g. "date_time" -> "dateTime".
    # Returns a new string; the argument is not modified.
    def camelcase(text)
      text.gsub(/_([a-z])/) { Regexp.last_match(1).upcase }
    end

  end
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
module TripleParser
  # One line of input split into its subject, predicate and object, each of
  # which is brewed into a Third by TMaker.
  class TripleSet
    require_relative 't_maker'

    def initialize(triple)
      @triple = triple
    end

    # The thirds found in the triple text (memoised).
    def parts
      @parts ||= get_parts
    end

    def subject
      @subject ||= parts[0]
    end

    def predicate
      @predicate ||= parts[1]
    end

    # The third part, or nil when it is missing or empty.
    def object
      @object ||= begin
        third_part = parts[2]
        third_part && !third_part.empty? ? third_part : nil
      end
    end

    private

    # Matches the triple text against the chosen pattern and brews a Third
    # from each captured group that is neither skipped nor nil.
    def get_parts
      # Choosing the pattern may set @skip_triple_part, so it must happen
      # before the groups are selected.
      found = pattern_to_split_triple.match(@triple)
      wanted_groups = [1, 2, 3].reject { |group| group == @skip_triple_part }
      captured = wanted_groups.collect { |group| found[group] }.compact
      captured.collect { |piece| TMaker.brew(piece) }
    end

    # Function style triples use their own pattern, whose third capture
    # group is an artefact of the arguments pattern and is skipped.
    def pattern_to_split_triple
      return triple_spitting_pattern unless triple_is_function?

      @skip_triple_part = 3
      pattern_to_split_function
    end

    def triple_is_function?
      function_pattern =~ @triple
    end

    # Whole-line test for "receiver function(arguments)" triples.
    def function_pattern
      Regexp.new(
        start_with_possible_white_space_pattern +
        start_variable_or_bracketed_url_pattern +
        receiving_variable_or_bracketed_url_pattern +
        function_name_pattern +
        function_arguments_pattern +
        closing_white_space_or_period_pattern
      )
    end

    # Captures the receiving element (group 1) and the function name with
    # its bracketed arguments (group 2).
    def pattern_to_split_function
      receiver = '[\w?:\/_\-#<>\.]+'

      Regexp.new("(#{receiver})#{spaces}(#{function_name_pattern}#{function_arguments_pattern})")
    end

    # Three whitespace separated groups, each of which may contain single or
    # double quoted text (which may itself contain spaces).
    def triple_spitting_pattern
      alternatives = [
        %q{\S*\'.*\'\S*},  # contains single quoted text
        %q{\S*\".*\"\S*},  # contains double quoted text
        '\S*'              # text not split by spaces
      ]
      group = "(#{alternatives.join('|')})"
      Regexp.new([group, group, group].join('\s+'))
    end

    def spaces
      '\s+'
    end

    def start_with_possible_white_space_pattern
      '^\s*'
    end

    def start_variable_or_bracketed_url_pattern
      '(\?|<http:\/\/)'
    end

    def receiving_variable_or_bracketed_url_pattern
      '[\w\/\-_#\.]+>?\s+'
    end

    def basic_text_pattern
      '\??[\w_\-:]+'
    end

    def function_name_pattern
      '[\w_\-:]+'
    end

    def function_arguments_pattern
      %q{\(([\w_\?:"']+[\s\,]*)+\)}
    end

    def closing_white_space_or_period_pattern
      '[\s\.]*$'
    end

    def standard_rdf_element_or_text_pattern
      '(?:<.*>|[\w\?\-:]+)'
    end

  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module TripleParser

  require_relative 'splitter'

  # Fallback splitter for a bare word (optionally hyphenated). Nothing is
  # extracted from the text: the parts hash is empty, so type, value and
  # url are all nil.
  class UnspecifiedSplitter < Splitter

    # Truthy (the match position) when a line of the string holds nothing but
    # word characters and hyphens, optionally surrounded by whitespace.
    def self.can_split?(string)
      any_word_possibly_hyphenated =~ string
    end

    def rdf_style
      'unspecified'
    end

    private

    # NOTE: a bare `private` does not apply to `def self.` methods, so this
    # class method is still callable from outside the class.
    def self.any_word_possibly_hyphenated
      /^\s*[\w\-]*\s*$/
    end

    # No type, value or url can be derived from an unspecified word.
    def get_parts
      Hash.new
    end

  end

end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module TripleParser

  # Splitter for variables: a question mark followed by a name, e.g. ?person.
  class VariableSplitter < Splitter

    # Truthy (the match position) for a '?' followed by a letter and then
    # word characters or hyphens, optionally surrounded by whitespace.
    def self.can_split?(string)
      any_word_starting_with_question_mark =~ string
    end

    def rdf_style
      'variable'
    end

    private

    # NOTE: a bare `private` does not apply to `def self.` methods, so this
    # class method is still callable from outside the class.
    def self.any_word_starting_with_question_mark
      /^\s*\?[A-Za-z][\w\-]*\s*$/
    end

    def get_parts
      {
        :type => 'var',
        :value => variable_name
      }
    end

    # The text after the leading '?'. Surrounding whitespace is removed
    # first because can_split? accepts it; without the strip, " ?x" would
    # yield "?x" and trailing whitespace would end up in the name.
    def variable_name
      strip[1..-1]
    end

  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require_relative 'triple_parser/t_maker'
|
2
|
+
require_relative 'triple_parser/triple_set'
|
3
|
+
require_relative 'triple_parser/to_rdf'
|
4
|
+
require_relative 'triple_parser/settings'
|
5
|
+
|
6
|
+
|
7
|
+
# Entry points of the gem: feed in text (or an array of triple strings) and
# get back TripleSet objects or RDF text.
module TripleParser

  # Stores the input that a later call to triples will parse.
  def self.input(new_input)
    @input = new_input
  end

  # Builds a TripleSet for every triple in the stored input.
  #
  # A String (or any String subclass) is read line by line, skipping blank
  # lines. An Array (or any Array subclass) is read element by element,
  # skipping nils. Any other input raises "Input format not recognised".
  def self.triples
    lines =
      case @input
      when String then @input.each_line.reject { |line| /^\s*$/ =~ line }
      when Array  then @input.compact
      else raise "Input format not recognised"
      end

    @triples = lines.collect { |triple| TripleSet.new(triple) }
  end

  # Converts the input into an array of RDF strings, one per triple. Each
  # string is the subject, predicate and object joined by spaces and
  # followed by " .".
  def self.to_rdf(input)
    @input = input
    triples.collect do |triple|
      thirds = [triple.subject, triple.predicate, triple.object]
      thirds.collect { |third| get_rdf_for(third) }.join(' ') + " ."
    end
  end

  # RDF text for a third; nil when the third is missing.
  def self.get_rdf_for(third)
    ToRdf.new(third).to_s if third
  end
end
|
metadata
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: triple_parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.9
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Rob Nichols
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-08-07 00:00:00.000000000Z
|
13
|
+
dependencies: []
|
14
|
+
description: Triple Parser - Parses RDF triples and converts them into standard format
|
15
|
+
email: rob@undervale.co.uk
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- README.md
|
21
|
+
- lib/main.rb
|
22
|
+
- lib/triple_parser.rb
|
23
|
+
- lib/triple_parser/triple_set.rb
|
24
|
+
- lib/triple_parser/variable_splitter.rb
|
25
|
+
- lib/triple_parser/third.rb
|
26
|
+
- lib/triple_parser/t_maker.rb
|
27
|
+
- lib/triple_parser/colon_separated_splitter.rb
|
28
|
+
- lib/triple_parser/splitter.rb
|
29
|
+
- lib/triple_parser/regional_text_splitter.rb
|
30
|
+
- lib/triple_parser/to_rdf.rb
|
31
|
+
- lib/triple_parser/unspecified_splitter.rb
|
32
|
+
- lib/triple_parser/bracketed_url_splitter.rb
|
33
|
+
- lib/triple_parser/settings.rb
|
34
|
+
homepage: https://github.com/reggieb/triple_parser
|
35
|
+
licenses: []
|
36
|
+
post_install_message:
|
37
|
+
rdoc_options: []
|
38
|
+
require_paths:
|
39
|
+
- lib
|
40
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
47
|
+
none: false
|
48
|
+
requirements:
|
49
|
+
- - ! '>='
|
50
|
+
- !ruby/object:Gem::Version
|
51
|
+
version: '0'
|
52
|
+
requirements: []
|
53
|
+
rubyforge_project:
|
54
|
+
rubygems_version: 1.8.10
|
55
|
+
signing_key:
|
56
|
+
specification_version: 3
|
57
|
+
summary: Triple Parser - Parses RDF triples and converts them into standard format
|
58
|
+
test_files: []
|