content_focus 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +33 -0
- data/examples/parse_twitter_profile.rb +11 -0
- data/lib/content_focus/html.rb +309 -0
- data/lib/content_focus/lexicon.txt +92662 -0
- data/lib/content_focus/linguistics.rb +147 -0
- data/lib/content_focus.rb +21 -0
- data/spec/content_focus_spec.rb +38 -0
- data/spec/data/confreaks.html +2634 -0
- data/spec/data/google_code_statistics.html +171 -0
- data/spec/data/kakuteru_article.html +199 -0
- data/spec/data/kakuteru_index.html +626 -0
- data/spec/data/movable_type_article.html +1243 -0
- data/spec/data/movable_type_index.html +1503 -0
- data/spec/data/simple_with_navigation.html +24 -0
- data/spec/data/twitter_profile.html +548 -0
- data/spec/data/typad_article.html +1421 -0
- data/spec/data/wordpress_article.html +2004 -0
- data/spec/data/wordpress_custom_article.html +527 -0
- metadata +83 -0
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# tagger.rb - a Ruby Part of Speech Tagger patterned on the work of Eric Brill. Version 0.1
|
|
2
|
+
#
|
|
3
|
+
# Copyright 2005 Mark Watson. All rights reserved.
|
|
4
|
+
# This software is released under the LGPL
|
|
5
|
+
#
|
|
6
|
+
# Contributor: Pat Eyler
|
|
7
|
+
#
|
|
8
|
+
|
|
9
|
+
module ContentFocus
|
|
10
|
+
|
|
11
|
+
module Linguistics
|
|
12
|
+
|
|
13
|
+
# Rule-based part-of-speech tagger patterned on Eric Brill's tagger.
#
# Every word is first given the first tag listed for it in the lexicon
# (falling back to its lower-cased form, then to 'NN'), after which a fixed
# sequence of transformational rules is applied to each position in order.
class Tagger

  # Tags whose words are never reported by Tagger.keywords_for_caption.
  UNINTERESTING_PARTS_OF_SPEECH = %w[DT PRP IN CC MD].freeze

  # Lexicon file read when no explicit path is given to #initialize.
  DEFAULT_LEXICON_PATH = File.join(File.dirname(__FILE__), 'lexicon.txt')

  # Loads the lexicon into memory.
  #
  # Each non-blank line is split on whitespace: the first token is the word,
  # the remaining tokens are the tags stored for that word.
  #
  # @param lexicon_path [String] file to read (defaults to the lexicon.txt
  #   that sits next to this source file)
  def initialize(lexicon_path = DEFAULT_LEXICON_PATH)
    @lexicon = {}
    # File.foreach closes the file even if the block raises.
    File.foreach(lexicon_path) do |line|
      tokens = line.split
      next if tokens.empty? # a blank line would otherwise store a nil key

      @lexicon[tokens.shift] = tokens
    end
  end

  # Splits a string on spaces, commas, periods, colons, semicolons and
  # apostrophes. Adjacent delimiters produce empty strings in the result.
  #
  # @param words [String]
  # @return [Array<String>]
  def tokenize(words)
    words.split(/[ ,.:;']/)
  end

  # Returns the lower-cased tokens of +caption+ that are longer than four
  # characters and whose tag is not in UNINTERESTING_PARTS_OF_SPEECH.
  #
  # A single Tagger is built on first use and reused afterwards. This is a
  # best-effort helper: if anything raises a StandardError, the raw token
  # list is returned instead (nil when tokenizing itself failed).
  #
  # @param caption [String]
  # @return [Array<String>, nil]
  def self.keywords_for_caption(caption)
    @tagger ||= new
    all_keywords = @tagger.tokenize(caption)
    pos_tags = @tagger.part_of_speech_tag(all_keywords)

    keywords = []
    all_keywords.each_with_index do |keyword, i|
      next if UNINTERESTING_PARTS_OF_SPEECH.include?(pos_tags[i])
      next unless keyword.size > 4

      keywords << keyword.downcase
    end
    keywords
  rescue StandardError
    all_keywords
  end

  # Tags every word of +text+ with a part of speech.
  #
  # @param text [String, Array<String>] a string (tokenized with #tokenize)
  #   or an array of words
  # @return [Array<String>] one tag per word, in the same order
  # @raise [RuntimeError] if +text+ is neither a String nor an Array
  def part_of_speech_tag(text)
    text = tokenize(text) if text.is_a?(String)
    raise RuntimeError, "can't tokenize #{text.class}" unless text.is_a?(Array)

    tags = text.map { |word| initial_tag(word) }
    # Rules run left to right, so the rules for position i see the already
    # transformed tag of position i - 1.
    text.each_index { |i| apply_transformation_rules(text, tags, i) }
    tags
  end

  private

  # Tags stored for +word+: exact match first, then its lower-cased form.
  # Returns nil when the lexicon has neither.
  def lexicon_entry(word)
    @lexicon[word] || @lexicon[word.downcase]
  end

  # First lexicon tag for +word+, or 'NN' when the word is unknown.
  def initial_tag(word)
    entry = lexicon_entry(word)
    (entry && entry.first) || 'NN'
  end

  # Applies the transformational rules, in their fixed order, to tags[i].
  def apply_transformation_rules(words, tags, i)
    word = words[i]
    previous_tag = i > 0 ? tags[i - 1] : nil
    previous_word = i > 0 ? words[i - 1] : nil

    ## rule 1: DT, {VBD | VBP | VB} --> DT, NN
    tags[i] = 'NN' if previous_tag == 'DT' && %w[VBD VBP VB].include?(tags[i])

    ## rule 2: convert a noun to a number (CD) if "." appears in the word
    tags[i] = 'CD' if tags[i] =~ /^N/ && word =~ /\./

    ## rule 3: convert a noun to a past participle if the word ends with "ed"
    tags[i] = 'VBN' if tags[i] =~ /^N/ && word =~ /ed$/

    ## rule 4: convert any type to adverb if it ends in "ly"
    tags[i] = 'RB' if word =~ /ly$/

    ## rule 5: convert a common noun (NN or NNS) to an adjective if it ends
    ## with "al"
    tags[i] = 'JJ' if tags[i] =~ /^NN/ && word =~ /al$/

    ## rule 6: convert a noun to a verb if the preceding word is "would"
    tags[i] = 'VB' if tags[i] =~ /^NN/ && previous_word && previous_word.downcase == 'would'

    ## rule 7: if a word has been categorized as a common noun and it ends
    ## with "s", then set its type to plural common noun (NNS)
    tags[i] = 'NNS' if tags[i] == 'NN' && word =~ /s$/

    ## rule 8: convert a common noun to a present participle verb
    ## (i.e., a gerund)
    tags[i] = 'VBG' if tags[i] =~ /^NN/ && word =~ /ing$/

    ## rule 9: <noun> <noun 2> --> <noun> <verb> if <noun 2> can also be a
    ## verb. Words tagged 'NN' by default have no lexicon entry, hence the
    ## empty-list fallback. VBZ is checked last, so it wins over VBN.
    if previous_tag =~ /^NN/ && tags[i] =~ /^NN/
      alternatives = lexicon_entry(word) || []
      tags[i] = 'VBN' if alternatives.include?('VBN')
      tags[i] = 'VBZ' if alternatives.include?('VBZ')
    end
  end

end # class Tagger
|
|
144
|
+
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
|
|
2
|
+
module ContentFocus
  # Version components of the gem, plus two string renderings of them.
  module Version
    MAJOR = 0
    MINOR = 1
    REVISION = 0

    # Dotted form of the version: MAJOR.MINOR.REVISION.
    def self.to_version
      [MAJOR, MINOR, REVISION].join('.')
    end

    # Underscore-separated form of the version: MAJOR_MINOR_REVISION.
    def self.to_name
      [MAJOR, MINOR, REVISION].join('_')
    end
  end
end
|
|
18
|
+
|
|
19
|
+
require 'hpricot'
|
|
20
|
+
require File.join(File.dirname(__FILE__), 'content_focus/linguistics')
|
|
21
|
+
require File.join(File.dirname(__FILE__), 'content_focus/html')
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
require 'rubygems'
|
|
4
|
+
require 'spec'
|
|
5
|
+
require File.join(File.dirname(__FILE__), '../lib/content_focus')
|
|
6
|
+
|
|
7
|
+
# Checks that the version string is exactly three dot-separated integers.
describe ContentFocus::Version, "#to_version" do
  it "should return version in 'X.Y.Z' format" do
    # Anchored with \A and \z so the whole string must match, not just a
    # substring of it.
    ContentFocus::Version.to_version.should =~ /\A\d+\.\d+\.\d+\z/
  end
end
|
|
12
|
+
|
|
13
|
+
# For each fixture in spec/data, checks which element static_fragment
# selects. Each expectation is [tag name, attribute name, attribute value].
describe ContentFocus::HTML, '#static_fragment' do

  {
    'simple_with_navigation' => ['div', 'id', 'post_1'],
    'kakuteru_article' => ['div', 'class', 'read page'],
    'wordpress_article' => ['div', 'class', 'post-body entry-content'],
    'typad_article' => ['div', 'class', 'content'],
    'twitter_profile' => ['ul', 'class', 'about vcard entry-author'],
    'kakuteru_index' => ['div', 'class', 'about'],
    'google_code_statistics' => ['div', 'class', 'article'],
    'wordpress_custom_article' => ['div', 'id', 'post-15361'],
    'movable_type_article' => ['div', 'class', 'asset-more'],
    #'movable_type_index' => ['div', 'id', 'home_posts_block'],
    'confreaks' => ['td', 'id', nil]
  }.each do |template, assertions|

    it "#{template} should return #{assertions.inspect}" do
      # File.read closes the file itself; File.open(...).read left one
      # handle open per example.
      fixture = File.read(File.join(File.dirname(__FILE__), "data/#{template}.html"))
      html_focus = ContentFocus::HTML.new(fixture)
      fragment = html_focus.static_fragment
      element = fragment[:element]
      element.name.should == assertions[0]
      element.attributes[assertions[1]].should == assertions[2]
    end

  end
end
|