content_focus 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.textile ADDED
@@ -0,0 +1,33 @@
1
+
2
+ h1. Content Focus
3
+
4
+ ContentFocus is a small gem that takes raw HTML and extracts the most relevant piece of content. This is useful, for example, when doing semantic analysis on HTML pages.
5
+
6
+ Right now, ContentFocus only supports 'permanent content extraction'. This is the content that's non-temporal on a page, like for example:
7
+
8
+ * About section
9
+ * Author information
10
+ * Article body
11
+ * Generic information block
12
+
13
+ The algorithm uses several heuristics to determine this, and it tries to ignore irrelevant pieces of content (navigation, styling, etc.).
14
+
15
+ h2. Example
16
+
17
+ <pre><code>
18
+ require 'rubygems'
19
+ require 'content_focus'
20
+
21
+ content_focus = ContentFocus::HTML.new(html_data)
22
+
23
+ # Will return the most relevant content in text
24
+ static_text = content_focus.static_text
25
+
26
+ # Will return the most relevant block of content in a Hpricot HTML tree element
27
+ static_fragment = content_focus.static_fragment
28
+ </code></pre>
29
+
30
+ h2. Author
31
+
32
+ Dominiek ter Heide
33
+ (Note: I wrote this a while back and thought this could be useful to some developers)
@@ -0,0 +1,11 @@
1
require 'rubygems'
require 'content_focus'

# Load the sample page that ships with the specs. File.read opens, reads and
# closes the file in one call, so no file handle is left open.
html_data = File.read(File.join(File.dirname(__FILE__), '../spec/data/twitter_profile.html'))
content_focus = ContentFocus::HTML.new(html_data)

# Will return the most relevant content as text
static_text = content_focus.static_text

# Will return a hash describing the most relevant block of content;
# the Hpricot HTML tree element itself is stored under :element
static_fragment = content_focus.static_fragment
@@ -0,0 +1,309 @@
1
+
2
+ module ContentFocus
3
+
4
+ ##
5
+ # Static content fragments are things like: title, about, author, content of an article, etc.
6
+ #
7
+ class HTML
8
+
9
# Parses the given markup once and keeps the resulting document in @doc;
# every extraction method of this class queries that document.
#
# html - the raw HTML, handed to Hpricot() as-is.
def initialize(html)
  @doc = Hpricot(html)
end
12
+
13
# Returns the text of the most relevant fragment (see static_fragment) with
# leading and trailing whitespace removed, or nil when no fragment was found.
def static_text
  fragment = static_fragment
  return nil unless fragment

  # String#strip! returns nil when there is nothing to strip, which made
  # already-clean text come back as nil. The non-bang form always returns
  # the (possibly unchanged) text.
  fragment[:element].inner_text.strip
end
17
+
18
# Based on the title, find a common chunk of HTML that is the most relevant.
# This is to extract atomic/permanent content.
#
# The scored blocks come from static_fragments. Blocks that share a parent
# are additionally merged into one candidate - the parent element, scored
# with the sum of its children - which then competes with the individual
# blocks on score.
#
# options - passed through untouched to static_fragments.
#
# Returns a fragment hash (:score, :element, :inner_text, :parent, plus
# :identifier for non-combined fragments) or nil when nothing was found.
def static_fragment(options = {})
  fragments = static_fragments(options)
  return nil if fragments.nil? || fragments.empty?
  return fragments.first if fragments.size == 1

  # Find common ancestors: group the fragments by parent and keep the
  # largest group (the first one wins when several are equally large).
  sibling_groups = fragments.select { |f| f[:parent] }.group_by { |f| f[:parent] }.values
  top_fragments = sibling_groups.max_by { |group| group.size }

  # No fragment knows its parent: fall back to the first individual block.
  return fragments.first if top_fragments.nil? || top_fragments.empty?

  # Create a combined fragment with combined score.
  container = top_fragments.first[:element].parent
  # :parent describes the parent of :element, so for the combined fragment
  # it has to be the container's parent, not the container itself. The
  # respond_to? check covers containers that expose no #parent at all.
  container_parent = container.respond_to?(:parent) ? container.parent : nil
  combined_fragment = {
    :score      => top_fragments.inject(0) { |sum, f| sum + f[:score] },
    :element    => container,
    :inner_text => container.inner_text,
    :parent     => container_parent ? container_parent.object_id : nil
  }

  # De-value the body tag: nearly everything lives below it, so it only
  # scores the number of fragments it groups.
  combined_fragment[:score] = top_fragments.size if container.name == 'body'

  # Highest score wins. max_by keeps the earliest candidate on ties, so an
  # individual block is preferred over the combined one when scores are equal.
  (fragments + [combined_fragment]).max_by { |f| f[:score] }
end
70
+
71
# Get all relevant div/span/td/body/p blocks from the HTML page - based on the <title>.
# This is to extract atomic/permanent content.
#
# Two passes are made over the document:
#   1. small blocks - only an element's own text nodes are matched against
#      the title keywords; a positive id/class name doubles the score.
#   2. big blocks   - only elements with a positive id/class name are
#      considered; their full inner text is matched and the score is tripled.
# Afterwards every block whose identifier was used by more than one element
# in the second pass is de-valued (score divided by 3).
#
# options - currently unused.
#
# Returns an Array of fragment hashes (:score, :element, :inner_text,
# :parent, :identifier) ordered by descending score. :parent is the
# object_id of the element's parent (or nil). The Array is empty when the
# page has no <title> or nothing matched.
def static_fragments(options = {})
  title_elements = (@doc/"title")
  # Without a <title> there are no keywords to score against. (This used to
  # reference an undefined local and raised NameError.)
  return [] if title_elements.nil? || title_elements.empty?

  title_inner_text = title_elements.first.inner_text
  keywords = Linguistics::Tagger.keywords_for_caption(title_inner_text)
  blocks = []

  # First, find the smallest blocks, but bigger than the title
  (@doc/"div|span|td|body|p|dd|ul").each do |element|
    next if element_with_negative_identifier(element)

    # Only the element's own text nodes count here, not the text of nested
    # elements. The `|| []` guards against elements reporting no children.
    text_nodes = (element.children || []).select { |child| child.is_a?(Hpricot::Text) }
    inner_text = text_nodes.map { |child| child.to_s }.join.downcase
    next if inner_text.size <= title_inner_text.size

    # Check the occurrence of keywords in the block, skip if none.
    # The text is split once per block instead of once per keyword.
    words = inner_text.split(/\s+/)
    num_matches = keywords.count { |keyword| words.include?(keyword) }
    next if num_matches == 0

    # Score: keyword matches, doubled for a positive id/class name
    identifier = element_with_positive_identifier(element)
    score = identifier ? num_matches * 2 : num_matches

    blocks << {:score => score, :element => element, :inner_text => inner_text, :parent => element.parent ? element.parent.object_id : nil, :identifier => identifier}
  end

  # How often each positive identifier is used by a big-block candidate
  big_block_identifiers = Hash.new(0)

  # Finding big blocks with both matches and positive identifiers
  (@doc/"div|span|table|td|body|p|dd|ul").each do |element|
    next if element_with_negative_identifier(element)

    identifier = element_with_positive_identifier(element)
    next unless identifier
    # Counted before the size/keyword checks on purpose: the statistics
    # cover every element carrying the identifier.
    big_block_identifiers[identifier] += 1

    inner_text = element.inner_text.downcase
    next if inner_text.size <= title_inner_text.size

    words = inner_text.split(/\s+/)
    num_matches = keywords.count { |keyword| words.include?(keyword) }
    next if num_matches == 0

    # :parent is stored as an object_id, exactly like in the first pass, so
    # that blocks from both passes with the same parent can be grouped.
    blocks << {:score => num_matches * 3, :element => element, :inner_text => inner_text, :parent => element.parent ? element.parent.object_id : nil, :identifier => identifier}
  end

  # De-value the identifiers that are repeated
  blocks.each do |block|
    next unless block[:identifier] && big_block_identifiers[block[:identifier]] > 1
    block[:score] = block[:score] / 3
  end

  # Drop blocks de-valued to zero and order the rest by top matches. The
  # index is the secondary sort key so equal scores keep discovery order.
  relevant = blocks.reject { |block| block[:score] == 0 }
  relevant.each_with_index.sort_by { |block, index| [-block[:score], index] }.map { |block, _index| block }
end
156
+
157
+ private
158
+
159
# Looks through the element's id/class names (see identifiers_for_element)
# and returns the first one listed in POSITIVE_IDENTIFIERS, or false when
# none of them is listed.
def element_with_positive_identifier(element)
  match = identifiers_for_element(element).find { |name| POSITIVE_IDENTIFIERS.include?(name) }
  match || false
end
165
+
166
# Looks through the element's id/class names (see identifiers_for_element)
# and returns the first one listed in NEGATIVE_IDENTIFIERS, or false when
# none of them is listed.
def element_with_negative_identifier(element)
  match = identifiers_for_element(element).find { |name| NEGATIVE_IDENTIFIERS.include?(name) }
  match || false
end
172
+
173
# Collects the names an element can be recognised by: its id attribute
# (when present) followed by each of its class names.
#
# element - anything responding to #attributes with 'id' / 'class' lookups.
#
# Returns an Array of Strings, empty when the element has neither attribute.
def identifiers_for_element(element)
  attributes = element.attributes
  identifiers = []
  identifiers << attributes['id'] if attributes['id']
  # A bare String#split splits on runs of whitespace and, unlike
  # split(/\s+/), yields no empty identifier when the class attribute
  # starts with whitespace.
  identifiers.concat(attributes['class'].split) if attributes['class']
  identifiers
end
182
+
183
+ # From: http://westciv.typepad.com/dog_or_higher/2005/11/real_world_sema.html
184
+ # Thanks go out to http://twitter.com/kimtaro
185
+
186
# id/class names that mark an element as likely content; a match raises the
# element's score in static_fragments. Frozen so the shared list cannot be
# modified by accident at runtime.
POSITIVE_IDENTIFIERS = %w[
  about
  entry
  description
  bodytext
  post
  author
  caption
  read
  summary
  maintext
  entry-body
  entry-content
  entry-author
  vcard
  article
].freeze
203
+
204
# id/class names that mark an element as irrelevant (navigation, ads,
# search forms, ...); static_fragments skips any element carrying one.
# Matching is case-sensitive. Frozen so the shared list cannot be modified
# by accident at runtime.
NEGATIVE_IDENTIFIERS = %w[
  navigation help noMargin prefill button
  Menu searchFormSection rightAnchor seeAllLink seeAllBullet
  adSpacer nav ocDCP Date CIPpromo
  small copyright tiny link search
  links topMenu left more smalltext
  prnav prred logo spacer MsoNormal
  searchbox leftnav inputbox topnav back
  searchinput border side selected icons
  helpblk ebcPic ebPicture visual topmenu-spacer
  submenu input navbar calendar formbut
  breadcrumb navlinks nwslink leftmenu rub1
  cbox ta-c formtext mainmenu cal
  searchtext sidebar powered imagealign ckCol
  binImg tm searchform separator btn
  menu2 foot_alt bannerAd tabs icomtb
  ContentBorder timestamp TextAd Label banner
  navtext udm pagenav style6 bottomnav
  alt nav3 bot narrowcolumn clickPath
  formbutt lnav navcolor navMainSections sidebarad
  cattitle ens fivevert disclaimer disclaimerlink
].freeze
306
+
307
+ end
308
+
309
+ end