content_focus 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,147 @@
1
+ # tagger.rb - a Ruby Part of Speech Tagger patterned on the work of Eric Brill. Version 0.1
2
+ #
3
+ # Copyright 2005 Mark Watson. All rights reserved.
4
+ # This software is released under the LGPL
5
+ #
6
+ # Contributor: Pat Eyler
7
+ #
8
+
9
+ module ContentFocus
10
+
11
+ module Linguistics
12
+
13
# Rule-based part-of-speech tagger patterned on Eric Brill's tagger.
#
# Every token first receives the first tag the lexicon lists for it (trying
# the exact spelling, then the lower-cased spelling, then defaulting to
# "NN"). A fixed sequence of transformational rules then corrects the most
# common mistakes of that first guess.
class Tagger

  # Tags whose words are never reported by +keywords_for_caption+.
  UNINTERESTING_PARTS_OF_SPEECH = ['DT', 'PRP', 'IN', 'CC', 'MD'].freeze

  # Loads the lexicon into memory.
  #
  # lexicon_path:: file with one entry per line, "<word> <tag> <tag> ...".
  #                Defaults to the lexicon.txt that sits next to this file.
  #
  # Raises the usual Errno::* errors when the file cannot be read.
  def initialize(lexicon_path = File.join(File.dirname(__FILE__), 'lexicon.txt'))
    @lexicon = {}
    # File.foreach closes the file even when a line raises.
    File.foreach(lexicon_path) do |line|
      word, *tags = line.split
      next if word.nil? # blank line: nothing to index
      @lexicon[word] = tags
    end
  end

  # Splits +words+ on spaces, commas, periods, colons, semicolons and
  # apostrophes.
  #
  # Returns an Array of non-empty Strings. Adjacent delimiters (", ") do not
  # produce empty tokens, which would otherwise be tagged as words and
  # disturb the rules that look at the preceding token.
  def tokenize(words)
    words.split(/ |,|\.|\:|\;|\'/).reject { |token| token.empty? } #'
  end

  # Extracts lower-cased keywords from +caption+: tokens longer than four
  # characters whose tag is not in UNINTERESTING_PARTS_OF_SPEECH.
  #
  # The shared tagger instance is built lazily on first use. This method is
  # best-effort: if anything fails it returns the untagged tokens, or an
  # empty Array when the failure happened before the caption was tokenized.
  def self.keywords_for_caption(caption)
    @@tagger ||= new
    all_keywords = @@tagger.tokenize(caption)
    pos_tags = @@tagger.part_of_speech_tag(all_keywords)

    keywords = []
    all_keywords.each_with_index do |keyword, i|
      next if UNINTERESTING_PARTS_OF_SPEECH.include?(pos_tags[i])
      next unless keyword.size > 4
      keywords << keyword.downcase
    end
    keywords
  rescue
    all_keywords || []
  end

  # Tags every token of +text+.
  #
  # text:: a String (tokenized with +tokenize+) or an Array of String tokens.
  #
  # Returns an Array of tag Strings, one per token, in token order.
  # Raises RuntimeError when +text+ is neither a String nor an Array.
  def part_of_speech_tag(text)
    text = tokenize(text) if text.is_a?(String)
    raise RuntimeError, "can't tokenize #{text.class}" unless text.is_a?(Array)

    # Initial guess. The whole `||` chain must be evaluated before the value
    # is stored; `ret << a || b` would append `a` and ignore `b`.
    tags = text.map do |word|
      first_tag(word) || first_tag(word.downcase) || 'NN'
    end

    ## Now, apply transformational rules:
    text.each_with_index do |word, i|
      # Tag of the preceding token; it is already final because the rules
      # for index i only ever rewrite tags[i].
      previous_tag = i > 0 ? tags[i - 1] : nil

      ## rule 1: DT, {VBD | VBP | VB} --> DT, NN
      if previous_tag == "DT" && ["VBD", "VBP", "VB"].include?(tags[i])
        tags[i] = "NN"
      end

      ## rule 2: convert a noun to a number (CD) if "." appears in the word
      tags[i] = "CD" if tags[i] =~ /^N/ && word =~ /\./

      ## rule 3: convert a noun to a past participle if the word ends
      ## with "ed"
      tags[i] = "VBN" if tags[i] =~ /^N/ && word =~ /ed$/

      ## rule 4: convert any type to adverb if it ends in "ly"
      tags[i] = "RB" if word =~ /ly$/

      ## rule 5: convert a common noun (NN or NNS) to an adjective if
      ## it ends with "al"
      tags[i] = "JJ" if tags[i] =~ /^NN/ && word =~ /al$/

      ## rule 6: convert a noun to a verb if the preceding word is "would"
      if i > 0 && tags[i] =~ /^NN/ && text[i - 1].downcase == "would"
        tags[i] = "VB"
      end

      ## rule 7: if a word has been categorized as a common noun and
      ## it ends with "s", then set its type to plural common noun (NNS)
      tags[i] = "NNS" if tags[i] == "NN" && word =~ /s$/

      ## rule 8: convert a common noun to a present participle
      ## verb (i.e., a gerund)
      tags[i] = "VBG" if tags[i] =~ /^NN/ && word =~ /ing$/

      ## rule 9: <noun> <noun 2> --> <noun> <verb> if <noun 2>
      ## can also be a verb
      if previous_tag =~ /^NN/ && tags[i] =~ /^NN/
        # A word tagged through the lower-case fallback or the "NN" default
        # has no entry under its exact spelling; treat that as "no
        # alternative tags" rather than calling include? on nil.
        alternatives = Array(@lexicon[word])
        tags[i] = "VBN" if alternatives.include?("VBN")
        tags[i] = "VBZ" if alternatives.include?("VBZ") # VBZ wins over VBN
      end
    end

    tags
  end

  private

  # First tag the lexicon lists for +word+ (exact spelling), or nil when the
  # word is unknown or its entry carries no tags.
  def first_tag(word)
    entry = @lexicon[word]
    entry && entry[0]
  end

end # class Tagger
144
+
145
+ end
146
+
147
+ end
@@ -0,0 +1,21 @@
1
+
2
module ContentFocus
  # Version number of the gem, exposed both as individual components and as
  # pre-formatted strings.
  module Version
    MAJOR = 0
    MINOR = 1
    REVISION = 0

    # Dotted form, e.g. "0.1.0".
    def self.to_version
      [MAJOR, MINOR, REVISION].join('.')
    end

    # Underscore-separated form, e.g. "0_1_0".
    def self.to_name
      [MAJOR, MINOR, REVISION].join('_')
    end
  end
end
18
+
19
+ require 'hpricot'
20
+ require File.join(File.dirname(__FILE__), 'content_focus/linguistics')
21
+ require File.join(File.dirname(__FILE__), 'content_focus/html')
@@ -0,0 +1,38 @@
1
+
2
+
3
+ require 'rubygems'
4
+ require 'spec'
5
+ require File.join(File.dirname(__FILE__), '../lib/content_focus')
6
+
7
# The version string must be exactly three dot-separated integers.
describe ContentFocus::Version, "#to_version" do
  it "should return version in 'X.Y.Z' format" do
    # Anchored with \A and \z so the whole string has to match; an unanchored
    # pattern would also accept e.g. "v0.1.0-beta".
    ContentFocus::Version.to_version.should =~ /\A\d+\.\d+\.\d+\z/
  end
end
12
+
13
# Runs ContentFocus::HTML#static_fragment against each HTML fixture in the
# data/ directory next to this spec and checks which element was picked.
# NOTE(review): the description says '#static_text' while the examples call
# static_fragment - confirm which name is intended.
describe ContentFocus::HTML, '#static_text' do

  # fixture name => [expected tag name, attribute to inspect, expected value]
  {
    'simple_with_navigation' => ['div', 'id', 'post_1'],
    'kakuteru_article' => ['div', 'class', 'read page'],
    'wordpress_article' => ['div', 'class', 'post-body entry-content'],
    'typad_article' => ['div', 'class', 'content'],
    'twitter_profile' => ['ul', 'class', 'about vcard entry-author'],
    'kakuteru_index' => ['div', 'class', 'about'],
    'google_code_statistics' => ['div', 'class', 'article'],
    'wordpress_custom_article' => ['div', 'id', 'post-15361'],
    'movable_type_article' => ['div', 'class', 'asset-more'],
    #'movable_type_index' => ['div', 'id', 'home_posts_block'],
    'confreaks' => ['td', 'id', nil]
  }.each do |template, assertions|

    it "#{template} should return #{assertions.inspect}" do
      fixture = File.join(File.dirname(__FILE__), "data/#{template}.html")
      # File.read closes the fixture after reading it; File.open(...).read
      # left the handle open until garbage collection.
      html_focus = ContentFocus::HTML.new(File.read(fixture))
      element = html_focus.static_fragment[:element]
      element.name.should == assertions[0]
      element.attributes[assertions[1]].should == assertions[2]
    end

  end
end