content_focus 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +33 -0
- data/examples/parse_twitter_profile.rb +11 -0
- data/lib/content_focus/html.rb +309 -0
- data/lib/content_focus/lexicon.txt +92662 -0
- data/lib/content_focus/linguistics.rb +147 -0
- data/lib/content_focus.rb +21 -0
- data/spec/content_focus_spec.rb +38 -0
- data/spec/data/confreaks.html +2634 -0
- data/spec/data/google_code_statistics.html +171 -0
- data/spec/data/kakuteru_article.html +199 -0
- data/spec/data/kakuteru_index.html +626 -0
- data/spec/data/movable_type_article.html +1243 -0
- data/spec/data/movable_type_index.html +1503 -0
- data/spec/data/simple_with_navigation.html +24 -0
- data/spec/data/twitter_profile.html +548 -0
- data/spec/data/typad_article.html +1421 -0
- data/spec/data/wordpress_article.html +2004 -0
- data/spec/data/wordpress_custom_article.html +527 -0
- metadata +83 -0
data/README.textile
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
|
|
2
|
+
h1. Content Focus
|
|
3
|
+
|
|
4
|
+
This is a little gem that allows you to input raw HTML and extract the most relevant piece of content. This is useful, for example, when doing semantic analysis on HTML pages.
|
|
5
|
+
|
|
6
|
+
Right now, ContentFocus only supports 'permanent content extraction'. This is the content on a page that is non-temporal, for example:
|
|
7
|
+
|
|
8
|
+
* About section
|
|
9
|
+
* Author information
|
|
10
|
+
* Article body
|
|
11
|
+
* Generic information block
|
|
12
|
+
|
|
13
|
+
The algorithm uses several heuristics to determine this and tries to ignore irrelevant pieces of content (navigation, styling, etc.).
|
|
14
|
+
|
|
15
|
+
h2. Example
|
|
16
|
+
|
|
17
|
+
<pre><code>
|
|
18
|
+
require 'rubygems'
|
|
19
|
+
require 'content_focus'
|
|
20
|
+
|
|
21
|
+
content_focus = ContentFocus::HTML.new(html_data)
|
|
22
|
+
|
|
23
|
+
# Will return the most relevant content in text
|
|
24
|
+
static_text = content_focus.static_text
|
|
25
|
+
|
|
26
|
+
# Will return the most relevant block of content in a Hpricot HTML tree element
|
|
27
|
+
static_fragment = content_focus.static_fragment
|
|
28
|
+
</code></pre>
|
|
29
|
+
|
|
30
|
+
h2. Author
|
|
31
|
+
|
|
32
|
+
Dominiek ter Heide
|
|
33
|
+
(Note: I wrote this a while back and thought this could be useful to some developers)
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
require 'rubygems'
require 'content_focus'

# Example: run ContentFocus against the saved Twitter profile page that ships
# with the specs.

# File.read opens, reads and closes the file in a single call, so no file
# handle is left open (File.open(...).read never closed it).
html_data = File.read(File.join(File.dirname(__FILE__), '../spec/data/twitter_profile.html'))
content_focus = ContentFocus::HTML.new(html_data)

# Will return the most relevant content as text (nil when nothing was found)
static_text = content_focus.static_text

# Will return the most relevant block of content as a Hash; its :element entry
# holds the HTML tree element produced by the parser
static_fragment = content_focus.static_fragment
|
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
|
|
2
|
+
module ContentFocus

  ##
  # Static content fragments are things like: title, about, author, content of an article, etc.
  #
  # Candidate elements are scored by how many keywords derived from the page's
  # <title> occur in their text. An id/class listed in POSITIVE_IDENTIFIERS
  # multiplies the score; an id/class listed in NEGATIVE_IDENTIFIERS excludes
  # the element altogether.
  #
  # Fragments are plain Hashes with the keys :score, :element, :inner_text,
  # :parent and (for fragments built by #static_fragments) :identifier.
  #
  class HTML

    # html - the raw HTML markup; it is handed to Hpricot() for parsing.
    def initialize(html)
      @doc = Hpricot(html)
    end

    # Returns the inner text of the most relevant fragment with surrounding
    # whitespace removed, or nil when no fragment could be determined.
    def static_text
      fragment = static_fragment
      # String#strip, not strip!: the bang variant returns nil when there is
      # nothing to strip, which made already-clean text come back as nil.
      fragment ? fragment[:element].inner_text.strip : nil
    end

    # Based on the title, find a common chunk of HTML that is the most relevant.
    # This is to extract atomic/permanent content.
    #
    # options - passed through to #static_fragments.
    #
    # Returns a single fragment Hash, or nil when there are no fragments.
    def static_fragment(options = {})
      fragments = static_fragments(options)

      return nil if fragments.nil? || fragments.empty?
      return fragments.first if fragments.size == 1

      # Find common ancestors: bucket the fragments by their :parent value.
      # Fragments without a parent are left out of the buckets.
      fragments_by_parent = {}
      fragments.each do |fragment|
        next unless fragment[:parent]
        (fragments_by_parent[fragment[:parent]] ||= []) << fragment
      end

      # Find the top parent: the bucket holding the most fragments. The
      # comparison is strict, so on a tie the bucket seen first is kept.
      top_fragments = []
      fragments_by_parent.each_value do |siblings|
        top_fragments = siblings if siblings.size > top_fragments.size
      end

      # Failed? No fragment had a parent, fall back to the best single fragment.
      return fragments.first if top_fragments.empty?

      # Create a combined fragment with combined score, represented by the
      # parent element of the first fragment in the winning bucket.
      container = top_fragments.first[:element].parent
      combined_fragment = {
        :score      => 0,
        :element    => container,
        :inner_text => container.inner_text,
        # NOTE(review): this is the id of the container itself, not of the
        # container's own parent - confirm that this is intended.
        :parent     => container.object_id
      }
      top_fragments.each { |f| combined_fragment[:score] = combined_fragment[:score] + f[:score] }

      # De-value the body tag: it only scores the number of fragments it combines.
      combined_fragment[:score] = top_fragments.size if container.name == 'body'

      # Add combined fragment to pool and re-order by score (highest first).
      fragments << combined_fragment
      fragments.sort! { |b, a| a[:score] <=> b[:score] }

      fragments.first
    end

    # Get all relevant div/span/td/body/p blocks from the HTML page - based on the <title>.
    # This is to extract atomic/permanent content.
    #
    # options - currently unused.
    #
    # Returns an Array of fragment Hashes ordered by descending :score, with
    # zero-score fragments removed. Returns an empty Array when the document
    # has no <title> element (this used to raise a NameError because of a
    # reference to an undefined `html`).
    def static_fragments(options = {})
      title_elements = (@doc / 'title')
      return [] if !title_elements || title_elements.empty?

      title_inner_text = title_elements.first.inner_text
      keywords = Linguistics::Tagger.keywords_for_caption(title_inner_text)

      # Number of keywords that occur as a whitespace-separated word in text.
      # The text is split once instead of once per keyword.
      count_keyword_matches = lambda do |text|
        words = text.split(/\s+/)
        keywords.count { |keyword| words.include?(keyword) }
      end

      blocks = []

      # First, find the smallest blocks, but bigger than the title
      (@doc / 'div|span|td|body|p|dd|ul').each do |element|
        next if element_with_negative_identifier(element)

        # Only text that sits directly inside the element counts here, text
        # of nested elements is ignored. A nil child list is treated as empty.
        text_nodes = (element.children || []).select { |child| child.is_a?(Hpricot::Text) }
        inner_text = text_nodes.map { |child| child.to_s }.join.downcase
        next if inner_text.size <= title_inner_text.size

        # Check the occurrence of keywords in block, skip if none
        num_matches = count_keyword_matches.call(inner_text)
        next if num_matches == 0

        # Calculate a score based on keyword matches times positive naming of id/class
        identifier = element_with_positive_identifier(element)
        score = identifier ? num_matches * 2 : num_matches

        blocks << {
          :score      => score,
          :element    => element,
          :inner_text => inner_text,
          :parent     => element.parent ? element.parent.object_id : nil,
          :identifier => identifier
        }
      end

      # How often each positive identifier was seen on a big block
      big_block_identifiers = {}

      # Finding big blocks with both matches and positive identifiers
      (@doc / 'div|span|table|td|body|p|dd|ul').each do |element|
        next if element_with_negative_identifier(element)

        # Need to log identifier statistics; blocks without a positive
        # identifier are not considered in this pass.
        identifier = element_with_positive_identifier(element)
        next unless identifier
        big_block_identifiers[identifier] ||= 0
        big_block_identifiers[identifier] += 1

        # Here the complete text of the element, nested elements included, is used.
        inner_text = element.inner_text.downcase
        next if inner_text.size <= title_inner_text.size

        # Check the occurrence of keywords in block, skip if none
        num_matches = count_keyword_matches.call(inner_text)
        next if num_matches == 0

        blocks << {
          # Keyword matches times three: a positive identifier is guaranteed here
          :score      => num_matches * 3,
          :element    => element,
          :inner_text => inner_text,
          # NOTE(review): the first pass stores the parent's object_id here,
          # this pass stores the parent element itself, so blocks from the two
          # passes never share a :parent key in #static_fragment. Left as-is to
          # keep the existing ranking - confirm which form is intended.
          :parent     => element.parent,
          :identifier => identifier
        }
      end

      # De-value the identifiers that are repeated (integer division by three)
      blocks.each do |block|
        if block[:identifier] && big_block_identifiers[block[:identifier]].to_i > 1
          block[:score] = block[:score] / 3
        end
      end

      # Order those blocks by top matches and drop the ones left without a score
      blocks.sort! { |b, a| a[:score] <=> b[:score] }
      blocks.reject! { |b| b[:score] == 0 }
      blocks
    end

    private

    # Returns the first id/class of element that is listed in
    # POSITIVE_IDENTIFIERS, or false when there is none.
    def element_with_positive_identifier(element)
      identifiers_for_element(element).each do |identifier|
        return identifier if POSITIVE_IDENTIFIERS.include?(identifier)
      end
      false
    end

    # Returns the first id/class of element that is listed in
    # NEGATIVE_IDENTIFIERS, or false when there is none.
    def element_with_negative_identifier(element)
      identifiers_for_element(element).each do |identifier|
        return identifier if NEGATIVE_IDENTIFIERS.include?(identifier)
      end
      false
    end

    # Returns an Array with the element's id attribute (when present) followed
    # by every whitespace-separated name from its class attribute.
    def identifiers_for_element(element)
      identifiers = []
      identifiers << element.attributes['id'] if element.attributes['id']
      klasses = element.attributes['class']
      identifiers.concat(klasses.split(/\s+/)) if klasses
      identifiers
    end

    # From: http://westciv.typepad.com/dog_or_higher/2005/11/real_world_sema.html
    # Thanks go out to http://twitter.com/kimtaro

    # id/class names that raise the score of an element. Frozen so the shared
    # list cannot be modified by accident.
    POSITIVE_IDENTIFIERS = %w[
      about entry description bodytext post
      author caption read summary maintext
      entry-body entry-content entry-author vcard article
    ].freeze

    # id/class names that exclude an element from consideration. Frozen so the
    # shared list cannot be modified by accident.
    NEGATIVE_IDENTIFIERS = %w[
      navigation help noMargin prefill button
      Menu searchFormSection rightAnchor seeAllLink seeAllBullet
      adSpacer nav ocDCP Date CIPpromo
      small copyright tiny link search
      links topMenu left more smalltext
      prnav prred logo spacer MsoNormal
      searchbox leftnav inputbox topnav back
      searchinput border side selected icons
      helpblk ebcPic ebPicture visual topmenu-spacer
      submenu input navbar calendar formbut
      breadcrumb navlinks nwslink leftmenu rub1
      cbox ta-c formtext mainmenu cal
      searchtext sidebar powered imagealign ckCol
      binImg tm searchform separator btn
      menu2 foot_alt bannerAd tabs icomtb
      ContentBorder timestamp TextAd Label banner
      navtext udm pagenav style6 bottomnav
      alt nav3 bot narrowcolumn clickPath
      formbutt lnav navcolor navMainSections sidebarad
      cattitle ens fivevert disclaimer disclaimerlink
    ].freeze

  end

end
|