content_focus 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.textile ADDED
@@ -0,0 +1,33 @@
1
+
2
+ h1. Content Focus
3
+
4
+ This is a small gem that takes raw HTML as input and extracts the most relevant piece of content. This is useful, for example, when doing semantic analysis on HTML pages.
5
+
6
+ Right now, ContentFocus only supports 'permanent content extraction', that is, extraction of the content on a page that does not change over time, for example:
7
+
8
+ * About section
9
+ * Author information
10
+ * Article body
11
+ * Generic information block
12
+
13
+ The algorithm combines several heuristics to determine this, and it tries to ignore irrelevant pieces of content (navigation, styling, etc.).
14
+
15
+ h2. Example
16
+
17
+ <pre><code>
18
+ require 'rubygems'
19
+ require 'content_focus'
20
+
21
+ content_focus = ContentFocus::HTML.new(html_data)
22
+
23
+ # Will return the most relevant content in text
24
+ static_text = content_focus.static_text
25
+
26
+ # Will return the most relevant block of content as an Hpricot HTML tree element
27
+ static_fragment = content_focus.static_fragment
28
+ </code></pre>
29
+
30
+ h2. Author
31
+
32
+ Dominiek ter Heide
33
+ (Note: I wrote this a while back and thought this could be useful to some developers)
@@ -0,0 +1,11 @@
1
require 'rubygems'
require 'content_focus'

# Load the sample page from the spec data directory. File.read opens, reads
# and closes the file in one call, so no file handle is left open.
html_data = File.read(File.join(File.dirname(__FILE__), '../spec/data/twitter_profile.html'))
content_focus = ContentFocus::HTML.new(html_data)

# Will return the most relevant content in text
static_text = content_focus.static_text

# Will return the most relevant block of content as an Hpricot HTML tree element
static_fragment = content_focus.static_fragment
@@ -0,0 +1,309 @@
1
+
2
module ContentFocus

  ##
  # Finds the block of an HTML page that most likely holds its static
  # (permanent) content: title, about, author, content of an article, etc.
  #
  # Candidate blocks are scored by the number of keywords from the page
  # <title> they contain. The score is multiplied when the block's id/class
  # is listed in POSITIVE_IDENTIFIERS; blocks whose id/class is listed in
  # NEGATIVE_IDENTIFIERS are skipped.
  #
  # A fragment is a Hash with the keys :score, :element, :inner_text and
  # :parent, plus :identifier for fragments built by #static_fragments.
  #
  class HTML

    # html is handed to Hpricot() and the parsed document is kept in @doc.
    def initialize(html)
      @doc = Hpricot(html)
    end

    # Returns the inner text of the most relevant fragment with surrounding
    # whitespace removed, or nil when no fragment was found.
    def static_text
      fragment = static_fragment
      # strip rather than strip!: the bang variant returns nil when there is
      # nothing to strip, which made already-clean text come back as nil.
      fragment ? fragment[:element].inner_text.strip : nil
    end

    # Based on the title, find a common chunk of HTML that is the most relevant
    # This is to extract atomic/permanent content
    #
    # options is passed through to #static_fragments.
    # Returns a single fragment Hash, or nil when there are no fragments.
    def static_fragment(options = {})
      fragments = static_fragments(options)

      return nil if fragments.nil? || fragments.empty?
      return fragments.first if fragments.size == 1

      # Find common ancestors; fragments without a parent are left out
      fragments_by_parents = fragments.select { |f| f[:parent] }.group_by { |f| f[:parent] }

      # Find the parent holding the most fragments (first one wins a tie)
      top_fragments = []
      fragments_by_parents.each_value do |siblings|
        top_fragments = siblings if siblings.size > top_fragments.size
      end

      # Failed?
      return fragments.first if top_fragments.empty?

      # Create a combined fragment (the shared parent) with combined score
      container = top_fragments.first[:element].parent
      combined_fragment = {
        :score => top_fragments.inject(0) { |sum, f| sum + f[:score] },
        :element => container,
        :inner_text => container.inner_text,
        # NOTE(review): this is the id of the container itself, not of the
        # container's parent; kept as-is so the returned value does not change.
        :parent => container.object_id
      }

      # De-value the body tag
      combined_fragment[:score] = top_fragments.size if container.name == 'body'

      # Add combined fragment to pool and re-order by score, highest first.
      fragments << combined_fragment
      fragments.sort! { |a, b| b[:score] <=> a[:score] }

      fragments.first
    end

    # Get all relevant div/span/td/body/p blocks from the HTML page - based on the <title>
    # This is to extract atomic/permanent content
    #
    # options is accepted for interface compatibility and is currently unused.
    # Returns an Array of fragment Hashes sorted by descending :score with
    # zero-score entries removed; an empty Array when the page has no <title>.
    def static_fragments(options = {})
      title_elements = (@doc/"title")
      # Without a title there are no keywords to score against. (This used to
      # return an undefined local `html`, which raised NameError.)
      return [] if !title_elements || title_elements.empty?

      title_inner_text = title_elements.first.inner_text
      keywords = Linguistics::Tagger.keywords_for_caption(title_inner_text)
      blocks = []

      # First, find the smallest blocks, but bigger than the title.
      # Only text nodes directly under the element are considered here.
      (@doc/"div|span|td|body|p|dd|ul").each do |element|
        next if element_with_negative_identifier(element)

        inner_text = direct_text_of(element).downcase
        next if inner_text.size <= title_inner_text.size

        # Check the occurrence of keywords in block, skip if none
        num_matches = keyword_matches(keywords, inner_text)
        next if num_matches == 0

        # Score: keyword matches, doubled for a positive id/class
        identifier = element_with_positive_identifier(element)
        score = identifier ? num_matches * 2 : num_matches

        blocks << build_block(score, element, inner_text, identifier)
      end

      # How often each positive identifier occurs among the big blocks
      big_block_identifiers = Hash.new(0)

      # Finding big blocks with both matches and positive identifiers.
      # Here the full inner text of the element is used.
      (@doc/"div|span|table|td|body|p|dd|ul").each do |element|
        next if element_with_negative_identifier(element)

        identifier = element_with_positive_identifier(element)
        next unless identifier

        # Counted before the size/keyword filters below, as statistics
        big_block_identifiers[identifier] += 1

        inner_text = element.inner_text.downcase
        next if inner_text.size <= title_inner_text.size

        # Check the occurrence of keywords in block, skip if none
        num_matches = keyword_matches(keywords, inner_text)
        next if num_matches == 0

        # Score: keyword matches, tripled because the identifier is positive
        blocks << build_block(num_matches * 3, element, inner_text, identifier)
      end

      # De-value the identifiers that are repeated (integer division)
      blocks.each do |block|
        if block[:identifier] && big_block_identifiers[block[:identifier]] > 1
          block[:score] = block[:score] / 3
        end
      end

      # Order those blocks by top matches
      blocks.sort! { |a, b| b[:score] <=> a[:score] }
      blocks.reject! { |b| b[:score] == 0 }
      blocks
    end

    private

    # Concatenated text of the Hpricot::Text nodes directly under element.
    # A nil children list is treated as empty instead of raising.
    def direct_text_of(element)
      (element.children || []).select { |child| child.is_a?(Hpricot::Text) }.map { |child| child.to_s }.join
    end

    # Number of keywords that appear as a whitespace-separated word in text.
    def keyword_matches(keywords, text)
      words = text.split(/\s+/)
      keywords.count { |keyword| words.include?(keyword) }
    end

    # Builds a fragment Hash. :parent is always the parent's object_id (or
    # nil) so that #static_fragment can group siblings from both passes.
    def build_block(score, element, inner_text, identifier)
      parent = element.parent
      {
        :score => score,
        :element => element,
        :inner_text => inner_text,
        :parent => parent ? parent.object_id : nil,
        :identifier => identifier
      }
    end

    # Returns the first id/class of element found in POSITIVE_IDENTIFIERS,
    # or false when there is none.
    def element_with_positive_identifier(element)
      identifiers_for_element(element).find { |identifier| POSITIVE_IDENTIFIERS.include?(identifier) } || false
    end

    # Returns the first id/class of element found in NEGATIVE_IDENTIFIERS,
    # or false when there is none.
    def element_with_negative_identifier(element)
      identifiers_for_element(element).find { |identifier| NEGATIVE_IDENTIFIERS.include?(identifier) } || false
    end

    # Returns the element's id (when present) followed by each of its
    # whitespace-separated class names.
    def identifiers_for_element(element)
      attributes = element.attributes
      identifiers = []
      identifiers << attributes['id'] if attributes['id']
      identifiers.concat(attributes['class'].split(/\s+/)) if attributes['class']
      identifiers
    end

    # From: http://westciv.typepad.com/dog_or_higher/2005/11/real_world_sema.html
    # Thanks go out to http://twitter.com/kimtaro

    # id/class names that multiply a block's score (matched case-sensitively)
    POSITIVE_IDENTIFIERS = [
      'about',
      'entry',
      'description',
      'bodytext',
      'post',
      'author',
      'caption',
      'read',
      'summary',
      'maintext',
      'entry-body',
      'entry-content',
      'entry-author',
      'vcard',
      'article'
    ].freeze

    # id/class names that cause a block to be skipped (matched case-sensitively)
    NEGATIVE_IDENTIFIERS = [
      'navigation',
      'help',
      'noMargin',
      'prefill',
      'button',
      'Menu',
      'searchFormSection',
      'rightAnchor',
      'seeAllLink',
      'seeAllBullet',
      'adSpacer',
      'nav',
      'ocDCP',
      'Date',
      'CIPpromo',
      'small',
      'copyright',
      'tiny',
      'link',
      'search',
      'links',
      'topMenu',
      'left',
      'more',
      'smalltext',
      'prnav',
      'prred',
      'logo',
      'spacer',
      'MsoNormal',
      'searchbox',
      'leftnav',
      'inputbox',
      'topnav',
      'back',
      'searchinput',
      'border',
      'side',
      'selected',
      'icons',
      'helpblk',
      'ebcPic',
      'ebPicture',
      'visual',
      'topmenu-spacer',
      'submenu',
      'input',
      'navbar',
      'calendar',
      'formbut',
      'breadcrumb',
      'navlinks',
      'nwslink',
      'leftmenu',
      'rub1',
      'cbox',
      'ta-c',
      'formtext',
      'mainmenu',
      'cal',
      'searchtext',
      'sidebar',
      'powered',
      'imagealign',
      'ckCol',
      'binImg',
      'tm',
      'searchform',
      'separator',
      'btn',
      'menu2',
      'foot_alt',
      'bannerAd',
      'tabs',
      'icomtb',
      'ContentBorder',
      'timestamp',
      'TextAd',
      'Label',
      'banner',
      'navtext',
      'udm',
      'pagenav',
      'style6',
      'bottomnav',
      'alt',
      'nav3',
      'bot',
      'narrowcolumn',
      'clickPath',
      'formbutt',
      'lnav',
      'navcolor',
      'navMainSections',
      'sidebarad',
      'cattitle',
      'ens',
      'fivevert',
      'disclaimer',
      'disclaimerlink'
    ].freeze

  end

end