content_focus 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,147 @@
1
+ # tagger.rb - a Ruby Part of Speech Tagger patterned on the work of Eric Brill. Version 0.1
2
+ #
3
+ # Copyright 2005 Mark Watson. All rights reserved.
4
+ # This software is released under the LGPL
5
+ #
6
+ # Contributor: Pat Eyler
7
+ #
8
+
9
module ContentFocus

  module Linguistics

    # Rule-based part-of-speech tagger patterned on the work of Eric Brill.
    #
    # Each word is first given the first tag listed for it in the lexicon
    # (exact spelling, then lowercase), defaulting to 'NN'. A fixed sequence
    # of transformational rules then rewrites the tags in a single
    # left-to-right pass.
    class Tagger

      # Tags whose words are never reported by keywords_for_caption.
      UNINTERESTING_PARTS_OF_SPEECH = %w[DT PRP IN CC MD].freeze

      # Loads the lexicon into memory.
      #
      # lexicon_path - path of the lexicon file; defaults to 'lexicon.txt'
      #                next to this source file. Every line is split on
      #                whitespace: the first token is the word, the remaining
      #                tokens are its tags.
      #
      # Raises the usual Errno error if the file cannot be read.
      def initialize(lexicon_path = File.join(File.dirname(__FILE__), 'lexicon.txt'))
        @lexicon = {}
        # File.foreach closes the file even when the block raises.
        File.foreach(lexicon_path) do |line|
          word, *tags = line.split
          next if word.nil? # blank line
          @lexicon[word] = tags
        end
      end

      # Splits a string on spaces and on , . : ; ' characters.
      #
      # Adjacent separators (for example ", ") produce empty strings in the
      # result; they are kept so token positions stay stable for callers.
      #
      # Returns an Array of Strings.
      def tokenize(words)
        words.split(/ |,|\.|\:|\;|\'/)
      end

      # Extracts lowercase keywords from a caption: tokens longer than four
      # characters whose tag is not in UNINTERESTING_PARTS_OF_SPEECH.
      #
      # A single tagger instance is created on first use and reused.
      # This is best-effort: if tagging fails, the raw tokens are returned
      # instead, or an empty Array when the failure happened before the
      # caption could be tokenized.
      def self.keywords_for_caption(caption)
        all_keywords = nil
        @tagger ||= new
        all_keywords = @tagger.tokenize(caption)
        pos_tags = @tagger.part_of_speech_tag(all_keywords)

        keywords = []
        all_keywords.each_with_index do |keyword, i|
          next if UNINTERESTING_PARTS_OF_SPEECH.include?(pos_tags[i])
          next unless keyword.size > 4
          keywords << keyword.downcase
        end
        keywords
      rescue StandardError
        all_keywords || []
      end

      # Tags every word of +text+.
      #
      # text - a String (tokenized with #tokenize first) or an Array of words.
      #
      # Returns an Array of tag Strings, one per word, in the same order.
      # Raises RuntimeError if text is neither a String nor an Array.
      def part_of_speech_tag(text)
        text = tokenize(text) if text.is_a?(String)

        ## we only work on arrays. If text isn't an array, quit now.
        raise RuntimeError, "can't tokenize #{text.class}" unless text.is_a?(Array)

        ret = text.map { |w| initial_tag(w) }

        ## Now, apply transformational rules. Order matters: each rule sees
        ## the tags already rewritten by the rules (and words) before it.
        text.each_index do |i|
          word = text[i]
          prev_tag = i > 0 ? ret[i - 1] : nil

          ## rule 1: DT, {VBD | VBP | VB} --> DT, NN
          if prev_tag == 'DT' && %w[VBD VBP VB].include?(ret[i])
            ret[i] = 'NN'
          end

          ## rule 2: convert a noun to a number (CD) if "." appears in the word
          ret[i] = 'CD' if ret[i] =~ /^N/ && word =~ /\./

          ## rule 3: convert a noun to a past participle if the word ends
          ## with "ed"
          ret[i] = 'VBN' if ret[i] =~ /^N/ && word =~ /ed$/

          ## rule 4: convert any type to adverb if it ends in "ly"
          ret[i] = 'RB' if word =~ /ly$/

          ## rule 5: convert a common noun (NN or NNS) to an adjective if
          ## it ends with "al"
          ret[i] = 'JJ' if ret[i] =~ /^NN/ && word =~ /al$/

          ## rule 6: convert a noun to a verb if the preceding word is "would"
          if i > 0 && ret[i] =~ /^NN/ && text[i - 1].downcase == 'would'
            ret[i] = 'VB'
          end

          ## rule 7: if a word has been categorized as a common noun and
          ## it ends with "s", then set its type to plural common noun (NNS)
          ret[i] = 'NNS' if ret[i] == 'NN' && word =~ /s$/

          ## rule 8: convert a common noun to a present participle
          ## verb (i.e., a gerund)
          ret[i] = 'VBG' if ret[i] =~ /^NN/ && word =~ /ing$/

          ## rule 9: <noun> <noun 2> --> <noun> <verb> if <noun 2>
          ## can also be a verb
          if i > 0 && prev_tag =~ /^NN/ && ret[i] =~ /^NN/
            possible = candidate_tags(word)
            ret[i] = 'VBN' if possible.include?('VBN')
            # checked last so VBZ wins when the lexicon lists both
            ret[i] = 'VBZ' if possible.include?('VBZ')
          end
        end

        ret
      end # def part_of_speech_tag

      private

      # First lexicon tag for the exact word, else for its lowercase form,
      # else the default 'NN'.
      def initial_tag(word)
        exact = @lexicon[word]
        folded = @lexicon[word.downcase]
        (exact && exact[0]) || (folded && folded[0]) || 'NN'
      end

      # Every tag the lexicon lists for the word (exact spelling first, then
      # lowercase); an empty Array for words the lexicon does not know.
      def candidate_tags(word)
        @lexicon[word] || @lexicon[word.downcase] || []
      end

    end # class Tagger

  end

end
@@ -0,0 +1,21 @@
1
+
2
module ContentFocus
  # Version components for ContentFocus and helpers that render them.
  module Version
    MAJOR = 0
    MINOR = 1
    REVISION = 0

    # Returns the version as "MAJOR.MINOR.REVISION".
    def self.to_version
      [MAJOR, MINOR, REVISION].join('.')
    end

    # Returns the version as "MAJOR_MINOR_REVISION".
    def self.to_name
      [MAJOR, MINOR, REVISION].join('_')
    end
  end
end
18
+
19
+ require 'hpricot'
20
+ require File.join(File.dirname(__FILE__), 'content_focus/linguistics')
21
+ require File.join(File.dirname(__FILE__), 'content_focus/html')
@@ -0,0 +1,38 @@
1
+
2
+
3
+ require 'rubygems'
4
+ require 'spec'
5
+ require File.join(File.dirname(__FILE__), '../lib/content_focus')
6
+
7
# Checks that the version string consists of exactly three dot-separated
# numbers.
describe ContentFocus::Version, "#to_version" do
  it "should return version in 'X.Y.Z' format" do
    # Anchored with \A and \z so text around the version fails the check.
    ContentFocus::Version.to_version.should =~ /\A\d+\.\d+\.\d+\z/
  end
end
12
+
13
# For each HTML fixture, checks which element ContentFocus::HTML#static_fragment
# selects.
describe ContentFocus::HTML, '#static_text' do

  # fixture name (data/<name>.html next to this spec) =>
  #   [expected tag name, attribute to inspect, expected attribute value]
  {
    'simple_with_navigation' => ['div', 'id', 'post_1'],
    'kakuteru_article' => ['div', 'class', 'read page'],
    'wordpress_article' => ['div', 'class', 'post-body entry-content'],
    'typad_article' => ['div', 'class', 'content'],
    'twitter_profile' => ['ul', 'class', 'about vcard entry-author'],
    'kakuteru_index' => ['div', 'class', 'about'],
    'google_code_statistics' => ['div', 'class', 'article'],
    'wordpress_custom_article' => ['div', 'id', 'post-15361'],
    'movable_type_article' => ['div', 'class', 'asset-more'],
    #'movable_type_index' => ['div', 'id', 'home_posts_block'],
    'confreaks' => ['td', 'id', nil]
  }.each do |template, assertions|

    it "#{template} should return #{assertions.inspect}" do
      fixture_path = File.join(File.dirname(__FILE__), "data/#{template}.html")
      # File.read closes the file itself; File.open(...).read left the handle open.
      html_focus = ContentFocus::HTML.new(File.read(fixture_path))
      fragment = html_focus.static_fragment
      element = fragment[:element]
      element.name.should == assertions[0]
      element.attributes[assertions[1]].should == assertions[2]
    end

  end
end