content_focus 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +33 -0
- data/examples/parse_twitter_profile.rb +11 -0
- data/lib/content_focus/html.rb +309 -0
- data/lib/content_focus/lexicon.txt +92662 -0
- data/lib/content_focus/linguistics.rb +147 -0
- data/lib/content_focus.rb +21 -0
- data/spec/content_focus_spec.rb +38 -0
- data/spec/data/confreaks.html +2634 -0
- data/spec/data/google_code_statistics.html +171 -0
- data/spec/data/kakuteru_article.html +199 -0
- data/spec/data/kakuteru_index.html +626 -0
- data/spec/data/movable_type_article.html +1243 -0
- data/spec/data/movable_type_index.html +1503 -0
- data/spec/data/simple_with_navigation.html +24 -0
- data/spec/data/twitter_profile.html +548 -0
- data/spec/data/typad_article.html +1421 -0
- data/spec/data/wordpress_article.html +2004 -0
- data/spec/data/wordpress_custom_article.html +527 -0
- metadata +83 -0
data/README.textile
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
|
2
|
+
h1. Content Focus
|
3
|
+
|
4
|
+
This is a small gem that takes raw HTML as input and extracts the most relevant piece of content. This is useful, for example, when doing semantic analysis on HTML pages.
|
5
|
+
|
6
|
+
Right now, ContentFocus only supports 'permanent content extraction'. This is the content on a page that is non-temporal, for example:
|
7
|
+
|
8
|
+
* About section
|
9
|
+
* Author information
|
10
|
+
* Article body
|
11
|
+
* Generic information block
|
12
|
+
|
13
|
+
The algorithm uses several heuristics to determine this, and it tries to ignore irrelevant pieces of content (navigation, styling, etc.).
|
14
|
+
|
15
|
+
h2. Example
|
16
|
+
|
17
|
+
<pre><code>
|
18
|
+
require 'rubygems'
|
19
|
+
require 'content_focus'
|
20
|
+
|
21
|
+
content_focus = ContentFocus::HTML.new(html_data)
|
22
|
+
|
23
|
+
# Will return the most relevant content in text
|
24
|
+
static_text = content_focus.static_text
|
25
|
+
|
26
|
+
# Will return the most relevant block of content as a Hash; its :element entry is the Hpricot HTML tree element
|
27
|
+
static_fragment = content_focus.static_fragment
|
28
|
+
</code></pre>
|
29
|
+
|
30
|
+
h2. Author
|
31
|
+
|
32
|
+
Dominiek ter Heide
|
33
|
+
(Note: I wrote this a while back and thought this could be useful to some developers)
|
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'rubygems'
require 'content_focus'

# Load the sample Twitter profile page that ships with the specs.
# File.read opens, reads and closes the file in one call, so no file handle
# is left open (File.open(...).read without a block never closed it).
html_data = File.read(File.join(File.dirname(__FILE__), '../spec/data/twitter_profile.html'))
content_focus = ContentFocus::HTML.new(html_data)

# Will return the most relevant content in text
static_text = content_focus.static_text

# Will return the most relevant block of content as a Hash; its :element
# entry is the Hpricot HTML tree element
static_fragment = content_focus.static_fragment
|
@@ -0,0 +1,309 @@
|
|
1
|
+
|
2
|
+
module ContentFocus

  ##
  # Static content fragments are things like: title, about, author, content of an article, etc.
  #
  # Candidate blocks are scored by the number of keywords (derived from the
  # page <title>) found in their text. The score is boosted when the block's
  # id/class is listed in POSITIVE_IDENTIFIERS, and blocks whose id/class is
  # listed in NEGATIVE_IDENTIFIERS are skipped entirely.
  #
  # A fragment is a Hash with the keys :score, :element, :inner_text, :parent
  # and (for fragments built by #static_fragments) :identifier.
  #
  class HTML

    # From: http://westciv.typepad.com/dog_or_higher/2005/11/real_world_sema.html
    # Thanks go out to http://twitter.com/kimtaro

    # id/class names that boost a block's score.
    POSITIVE_IDENTIFIERS = %w[
      about
      entry
      description
      bodytext
      post
      author
      caption
      read
      summary
      maintext
      entry-body
      entry-content
      entry-author
      vcard
      article
    ].freeze

    # id/class names that cause a block to be skipped. Matching is exact and
    # case-sensitive.
    NEGATIVE_IDENTIFIERS = %w[
      navigation
      help
      noMargin
      prefill
      button
      Menu
      searchFormSection
      rightAnchor
      seeAllLink
      seeAllBullet
      adSpacer
      nav
      ocDCP
      Date
      CIPpromo
      small
      copyright
      tiny
      link
      search
      links
      topMenu
      left
      more
      smalltext
      prnav
      prred
      logo
      spacer
      MsoNormal
      searchbox
      leftnav
      inputbox
      topnav
      back
      searchinput
      border
      side
      selected
      icons
      helpblk
      ebcPic
      ebPicture
      visual
      topmenu-spacer
      submenu
      input
      navbar
      calendar
      formbut
      breadcrumb
      navlinks
      nwslink
      leftmenu
      rub1
      cbox
      ta-c
      formtext
      mainmenu
      cal
      searchtext
      sidebar
      powered
      imagealign
      ckCol
      binImg
      tm
      searchform
      separator
      btn
      menu2
      foot_alt
      bannerAd
      tabs
      icomtb
      ContentBorder
      timestamp
      TextAd
      Label
      banner
      navtext
      udm
      pagenav
      style6
      bottomnav
      alt
      nav3
      bot
      narrowcolumn
      clickPath
      formbutt
      lnav
      navcolor
      navMainSections
      sidebarad
      cattitle
      ens
      fivevert
      disclaimer
      disclaimerlink
    ].freeze

    # html - the raw HTML source; it is handed to Hpricot for parsing.
    def initialize(html)
      @doc = Hpricot(html)
    end

    # Returns the stripped inner text of the best fragment, or nil when no
    # fragment was found.
    def static_text
      fragment = static_fragment
      # String#strip, not strip!: strip! returns nil when the text has no
      # surrounding whitespace, which made this method return nil for
      # perfectly good content.
      fragment ? fragment[:element].inner_text.strip : nil
    end

    # Based on the title, find a common chunk of HTML that is the most relevant
    # This is to extract atomic/permanent content
    #
    # options - passed through to #static_fragments.
    #
    # Returns the highest scoring fragment Hash, or nil when there is none.
    def static_fragment(options = {})
      fragments = static_fragments(options)

      return nil if fragments.nil? || fragments.empty?
      return fragments.first if fragments.size == 1

      # Find common ancestors: group the fragments by their :parent key.
      fragments_by_parents = {}
      fragments.each do |fragment|
        next unless fragment[:parent]
        (fragments_by_parents[fragment[:parent]] ||= []) << fragment
      end

      # Find the top parent: the group holding the most fragments
      # (the first such group wins on a tie).
      top_fragments = fragments_by_parents.values.max_by { |siblings| siblings.size }

      # Failed?
      return fragments.first if top_fragments.nil? || top_fragments.empty?

      # Create a combined fragment with combined score
      container = top_fragments.first[:element].parent
      combined_fragment = {
        :score      => top_fragments.inject(0) { |sum, f| sum + f[:score] },
        :element    => container,
        :inner_text => container.inner_text,
        :parent     => container.object_id
      }

      # De-value the body tag
      combined_fragment[:score] = top_fragments.size if container.name == 'body'

      # Add combined fragment to pool and re-order by score.
      fragments << combined_fragment
      fragments.sort! { |a, b| b[:score] <=> a[:score] }

      fragments.first
    end

    # Get all relevant div/span/td/body/p blocks from the HTML page - based on the <title>
    # This is to extract atomic/permanent content
    #
    # options - currently unused.
    #
    # Returns an Array of fragment Hashes ordered by descending score. The
    # Array is empty when the document has no <title> (there is nothing to
    # derive keywords from) or when no block matches.
    def static_fragments(options = {})
      title_elements = (@doc/"title")
      # Previously this returned the undefined name `html`, raising NameError.
      return [] if !title_elements || title_elements.empty?
      title_inner_text = title_elements.first.inner_text
      keywords = Linguistics::Tagger.keywords_for_caption(title_inner_text)
      blocks = []

      # First, find the smallest blocks, but bigger than the title
      (@doc/"div|span|td|body|p|dd|ul").each do |element|
        next if element_with_negative_identifier(element)

        # Only the text that sits directly inside this element is considered.
        inner_text = direct_text(element).downcase
        next if inner_text.size <= title_inner_text.size

        # Check the occurrence of keywords in block, skip if none
        num_matches = keyword_matches(keywords, inner_text)
        next if num_matches == 0

        # Calculate a score based on keyword matches times positive naming of id/class
        identifier = element_with_positive_identifier(element)
        score = identifier ? num_matches * 2 : num_matches

        blocks << {:score => score, :element => element, :inner_text => inner_text, :parent => element.parent ? element.parent.object_id : nil, :identifier => identifier}
      end

      big_block_identifiers = Hash.new(0)

      # Finding big blocks with both matches and positive identifiers
      (@doc/"div|span|table|td|body|p|dd|ul").each do |element|
        next if element_with_negative_identifier(element)

        # Need to log identifier statistics (counted before any size or
        # keyword filtering, so every positively named element is tallied).
        identifier = element_with_positive_identifier(element)
        next unless identifier
        big_block_identifiers[identifier] += 1

        inner_text = element.inner_text.downcase
        next if inner_text.size <= title_inner_text.size

        # Check the occurrence of keywords in block, skip if none
        num_matches = keyword_matches(keywords, inner_text)
        next if num_matches == 0

        # Calculate a score based on keyword matches times positive naming of id/class
        score = num_matches * 3

        # NOTE(review): :parent holds the parent element here, while the first
        # pass stores the parent's object_id, so blocks from the two passes
        # never share a group in #static_fragment. Looks unintentional, but it
        # is kept as-is because unifying it would change scoring - confirm.
        blocks << {:score => score, :element => element, :inner_text => inner_text, :parent => element.parent, :identifier => identifier}
      end

      # De-value the identifiers that are repeated
      blocks.each do |block|
        repeated = block[:identifier] && big_block_identifiers[block[:identifier]] > 1
        # Integer division: small scores can drop to 0 and are rejected below.
        block[:score] = block[:score] / 3 if repeated
      end

      # Order those blocks by top matches
      blocks.sort! { |a, b| b[:score] <=> a[:score] }
      blocks.reject! { |block| block[:score] == 0 }
      blocks
    end

    private

    # Returns the first id/class of the element that is listed in
    # POSITIVE_IDENTIFIERS, or false when there is none.
    def element_with_positive_identifier(element)
      identifiers_for_element(element).find { |identifier| POSITIVE_IDENTIFIERS.include?(identifier) } || false
    end

    # Returns the first id/class of the element that is listed in
    # NEGATIVE_IDENTIFIERS, or false when there is none.
    def element_with_negative_identifier(element)
      identifiers_for_element(element).find { |identifier| NEGATIVE_IDENTIFIERS.include?(identifier) } || false
    end

    # Returns the element's id (when present) followed by each of its
    # whitespace separated class names.
    def identifiers_for_element(element)
      attributes = element.attributes
      identifiers = []
      identifiers << attributes['id'] if attributes['id']
      identifiers.concat(attributes['class'].split(/\s+/)) if attributes['class']
      identifiers
    end

    # Concatenates the text nodes that are direct children of the element.
    def direct_text(element)
      text = ''
      # The `|| []` guards against elements that report no children as nil.
      (element.children || []).each do |child|
        text << child.to_s if child.is_a?(Hpricot::Text)
      end
      text
    end

    # Counts how many of the keywords occur as a whole whitespace separated
    # word in text. The text is split once instead of once per keyword.
    def keyword_matches(keywords, text)
      words = text.split(/\s+/)
      keywords.count { |keyword| words.include?(keyword) }
    end

  end

end
|