content_focus 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +33 -0
- data/examples/parse_twitter_profile.rb +11 -0
- data/lib/content_focus/html.rb +309 -0
- data/lib/content_focus/lexicon.txt +92662 -0
- data/lib/content_focus/linguistics.rb +147 -0
- data/lib/content_focus.rb +21 -0
- data/spec/content_focus_spec.rb +38 -0
- data/spec/data/confreaks.html +2634 -0
- data/spec/data/google_code_statistics.html +171 -0
- data/spec/data/kakuteru_article.html +199 -0
- data/spec/data/kakuteru_index.html +626 -0
- data/spec/data/movable_type_article.html +1243 -0
- data/spec/data/movable_type_index.html +1503 -0
- data/spec/data/simple_with_navigation.html +24 -0
- data/spec/data/twitter_profile.html +548 -0
- data/spec/data/typad_article.html +1421 -0
- data/spec/data/wordpress_article.html +2004 -0
- data/spec/data/wordpress_custom_article.html +527 -0
- metadata +83 -0
@@ -0,0 +1,147 @@
|
|
1
|
+
# tagger.rb - a Ruby Part of Speech Tagger patterned on the work of Eric Brill. Version 0.1
|
2
|
+
#
|
3
|
+
# Copyright 2005 Mark Watson. All rights reserved.
|
4
|
+
# This software is released under the LGPL
|
5
|
+
#
|
6
|
+
# Contributor: Pat Eyler
|
7
|
+
#
|
8
|
+
|
9
|
+
module ContentFocus
|
10
|
+
|
11
|
+
module Linguistics
|
12
|
+
|
13
|
+
# Rule-based part-of-speech tagger: every word first receives a tag from a
# lexicon file, then a fixed sequence of contextual/suffix rules rewrites
# some of those tags.
class Tagger

  # Tags that keywords_for_caption filters out of its result.
  UNINTERESTING_PARTS_OF_SPEECH = %w[DT PRP IN CC MD].freeze

  # Loads 'lexicon.txt' (located next to this source file) into @lexicon.
  # Each line is split on whitespace: the first token is the word, the
  # remaining tokens are stored as that word's list of tags.
  def initialize
    @lexicon = {}
    path = File.join(File.dirname(__FILE__), 'lexicon.txt')
    # File.foreach closes the file even if a line raises while being read.
    File.foreach(path) do |line|
      word, *tags = line.split
      next if word.nil? # blank line
      @lexicon[word] = tags
    end
  end

  # Splits +words+ on single spaces, commas, periods, colons, semicolons
  # and apostrophes. Adjacent delimiters yield empty strings in the result.
  def tokenize(words)
    words.split(/[ ,.:;']/)
  end

  # Returns the downcased tokens of +caption+ that are longer than four
  # characters and whose tag is not in UNINTERESTING_PARTS_OF_SPEECH.
  #
  # Best effort: if anything raises a StandardError, the untagged token
  # list is returned instead (an empty array when tokenizing itself failed).
  def self.keywords_for_caption(caption)
    @@tagger ||= new
    all_keywords = @@tagger.tokenize(caption)
    pos_tags = @@tagger.part_of_speech_tag(all_keywords)

    keywords = []
    all_keywords.each_with_index do |keyword, i|
      next if UNINTERESTING_PARTS_OF_SPEECH.include?(pos_tags[i])
      next unless keyword.size > 4
      keywords << keyword.downcase
    end
    keywords
  rescue StandardError
    all_keywords || []
  end

  # Tags every word of +text+ and returns the tags as an array of strings
  # in the same order as the words.
  #
  # text - a String (tokenized with #tokenize first) or an Array of words.
  #
  # Raises RuntimeError when +text+ is neither a String nor an Array.
  def part_of_speech_tag(text)
    text = tokenize(text) if text.is_a?(String)
    raise RuntimeError, "can't tokenize #{text.class}" unless text.is_a?(Array)

    ret = text.map { |w| initial_tag(w) }

    # Transformational rules. They are applied in this exact order because
    # later rules read the tag written by earlier ones.
    text.each_index do |i|
      word = text[i]

      # rule 1: DT, {VBD | VBP | VB} --> DT, NN
      if i > 0 && ret[i - 1] == "DT" && %w[VBD VBP VB].include?(ret[i])
        ret[i] = "NN"
      end

      # rule 2: convert a noun to a number (CD) if "." appears in the word
      ret[i] = "CD" if ret[i] =~ /^N/ && word =~ /\./

      # rule 3: convert a noun to a past participle if the word ends
      # with "ed"
      ret[i] = "VBN" if ret[i] =~ /^N/ && word =~ /ed$/

      # rule 4: convert any type to adverb if it ends in "ly"
      ret[i] = "RB" if word =~ /ly$/

      # rule 5: convert a common noun (NN or NNS) to an adjective if
      # it ends with "al"
      ret[i] = "JJ" if ret[i] =~ /^NN/ && word =~ /al$/

      # rule 6: convert a noun to a verb if the preceding word is "would"
      if i > 0 && ret[i] =~ /^NN/ && text[i - 1].downcase == "would"
        ret[i] = "VB"
      end

      # rule 7: if a word has been categorized as a common noun and
      # it ends with "s", then set its type to plural common noun (NNS)
      ret[i] = "NNS" if ret[i] == "NN" && word =~ /s$/

      # rule 8: convert a common noun to a present participle
      # verb (i.e., a gerund)
      ret[i] = "VBG" if ret[i] =~ /^NN/ && word =~ /ing$/

      # rule 9: <noun> <noun 2> --> <noun> <verb> if <noun 2>
      # can also be a verb. VBZ is checked last, so it wins over VBN.
      if i > 0 && ret[i - 1] =~ /^NN/ && ret[i] =~ /^NN/
        alternatives = known_tags(word)
        ret[i] = "VBN" if alternatives.include?("VBN")
        ret[i] = "VBZ" if alternatives.include?("VBZ")
      end
    end

    ret
  end

  private

  # First lexicon tag for +word+, trying the exact spelling and then the
  # downcased spelling; 'NN' when neither lookup yields a tag.
  def initial_tag(word)
    exact = @lexicon[word]
    return exact[0] if exact && exact[0]

    folded = @lexicon[word.downcase]
    return folded[0] if folded && folded[0]

    'NN'
  end

  # All lexicon tags recorded for +word+ (exact spelling first, then
  # downcased); an empty array for words missing from the lexicon, so that
  # callers can use include? without a nil check.
  def known_tags(word)
    @lexicon[word] || @lexicon[word.downcase] || []
  end

end
|
144
|
+
|
145
|
+
end
|
146
|
+
|
147
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
|
2
|
+
module ContentFocus
  # Version components of the library plus two string renderings of them.
  module Version
    MAJOR = 0
    MINOR = 1
    REVISION = 0

    # Dotted form, e.g. "0.1.0".
    def self.to_version
      [MAJOR, MINOR, REVISION].join('.')
    end

    # Underscore-separated form, e.g. "0_1_0".
    def self.to_name
      [MAJOR, MINOR, REVISION].join('_')
    end
  end
end
|
18
|
+
|
19
|
+
require 'hpricot'
|
20
|
+
require File.join(File.dirname(__FILE__), 'content_focus/linguistics')
|
21
|
+
require File.join(File.dirname(__FILE__), 'content_focus/html')
|
@@ -0,0 +1,38 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'spec'
|
5
|
+
require File.join(File.dirname(__FILE__), '../lib/content_focus')
|
6
|
+
|
7
|
+
# Checks the string rendering of the library version.
describe ContentFocus::Version, "#to_version" do
  it "should return version in 'X.Y.Z' format" do
    # Anchored at both ends so the whole string must be three dot-separated
    # numbers; an unanchored pattern would also accept e.g. "x1.2.3-foo".
    ContentFocus::Version.to_version.should =~ /\A\d+\.\d+\.\d+\z/
  end
end
|
12
|
+
|
13
|
+
# For each HTML fixture under spec/data, checks which element
# ContentFocus::HTML#static_fragment selects.
# NOTE(review): the description names '#static_text' while the examples call
# #static_fragment -- confirm which method this group is meant to cover.
describe ContentFocus::HTML, '#static_text' do

  # fixture name => [expected tag name, attribute name, expected attribute value]
  {
    'simple_with_navigation' => ['div', 'id', 'post_1'],
    'kakuteru_article' => ['div', 'class', 'read page'],
    'wordpress_article' => ['div', 'class', 'post-body entry-content'],
    'typad_article' => ['div', 'class', 'content'],
    'twitter_profile' => ['ul', 'class', 'about vcard entry-author'],
    'kakuteru_index' => ['div', 'class', 'about'],
    'google_code_statistics' => ['div', 'class', 'article'],
    'wordpress_custom_article' => ['div', 'id', 'post-15361'],
    'movable_type_article' => ['div', 'class', 'asset-more'],
    #'movable_type_index' => ['div', 'id', 'home_posts_block'],
    'confreaks' => ['td', 'id', nil]
  }.each do |template, assertions|

    it "#{template} should return #{assertions.inspect}" do
      fixture = File.join(File.dirname(__FILE__), "data/#{template}.html")
      # File.read opens and closes the fixture itself, so no handle is leaked.
      html_focus = ContentFocus::HTML.new(File.read(fixture))
      fragment = html_focus.static_fragment
      element = fragment[:element]
      element.name.should == assertions[0]
      element.attributes[assertions[1]].should == assertions[2]
    end

  end
end
|