pismo 0.5.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +19 -28
- data/NOTICE +4 -0
- data/README.markdown +37 -40
- data/Rakefile +3 -2
- data/VERSION +1 -1
- data/bin/pismo +15 -7
- data/lib/pismo/document.rb +2 -2
- data/lib/pismo/internal_attributes.rb +23 -16
- data/lib/pismo/reader.rb +390 -0
- data/lib/pismo.rb +3 -2
- data/pismo.gemspec +23 -15
- data/test/corpus/bbcnews2.html +1575 -0
- data/test/corpus/gmane.html +138 -0
- data/test/corpus/metadata_expected.yaml +20 -5
- data/test/corpus/queness.html +919 -0
- data/test/corpus/reader_expected.yaml +45 -0
- data/test/corpus/tweet.html +360 -0
- data/test/corpus/zefrank.html +535 -0
- data/test/test_corpus.rb +9 -1
- metadata +89 -34
- data/lib/pismo/readability.rb +0 -342
- data/test/test_readability.rb +0 -152
@@ -0,0 +1,45 @@
|
|
1
|
+
---
|
2
|
+
:bbcnews:
|
3
|
+
- "A UK charity is dealing with an increasing number of young gay Muslims becoming homeless after fleeing forced marriages and so-called honour violence."
|
4
|
+
- "During a weekly drop-in group held by the Albert Kennedy Trust in London, Suni, a 20-year-old London student, helps himself to a warm mince pie and a steaming cup of coffee."
|
5
|
+
:bbcnews2:
|
6
|
+
- "The government in England is setting out how parents, teachers, charities and other groups can apply to set up their own \"free school\"."
|
7
|
+
- "Free schools will be independent but funded by central government."
|
8
|
+
:briancray:
|
9
|
+
- "This is a mock post."
|
10
|
+
- "While there a place for all of these posts, I'm trying to make a point that original blogs are being shut out by formulaic blogs."
|
11
|
+
:cant_read:
|
12
|
+
- "For those of us who grew up as weird kids in the 1980s, the work of Berkeley Breathed was as important as those twin eternal pillars of weird-kid-dom: Monty Python and Mad magazine."
|
13
|
+
- "In a word: seminal."
|
14
|
+
:factor:
|
15
|
+
- "The Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks."
|
16
|
+
- "The VM loads an image file on startup, which becomes the data and code heap."
|
17
|
+
:gmane:
|
18
|
+
- "I am pleased to report that the GCC Steering Committee and the FSF have approved the use of C++ in GCC itself."
|
19
|
+
- "Of course, there's no reason for us to use C++ features just because we can."
|
20
|
+
:huffington:
|
21
|
+
- "The man on the motorcycle was going the wrong way down a one-way street, gesturing indignantly for the phalanx of traffic-clogged cars in front of him to move."
|
22
|
+
- "\"Brother, why are you angry with us?\" said a passenger leaning out of one of the vehicles blocking his path."
|
23
|
+
:queness:
|
24
|
+
- "CSS3 is hot these days and will soon be available in most modern browser."
|
25
|
+
- "Just recently, I started to become aware to the present of CSS3 around the web."
|
26
|
+
:rubyinside:
|
27
|
+
- "CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler."
|
28
|
+
- "Creator Jeremy Ashkenas calls it \"JavaScript's less ostentatious kid brother\" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax."
|
29
|
+
:rww:
|
30
|
+
- "I'm just aching to know if the new Apple tablet (insert caveats, weasel words and qualifiers here) is a potential Cintiq competitor."
|
31
|
+
- "I don't think it will be, but you never know."
|
32
|
+
:spolsky:
|
33
|
+
- "I've been dismayed to discover just how many software developers aren't really completely up to speed on the mysterious world of character sets, encodings, Unicode, all that stuff."
|
34
|
+
- "A couple of years ago, a beta tester for FogBUGZ was wondering whether it could handle incoming email in Japanese."
|
35
|
+
:techcrunch:
|
36
|
+
- "Last week, we covered Googlle opening a school in India."
|
37
|
+
- "Googlle, not to be confused with Google."
|
38
|
+
:tweet:
|
39
|
+
- "Gobsmacked that TeX/LaTeX (document formatting tools) for OS X is a 1.3GB (yes, GIGAbytes) download OS Wow..!"
|
40
|
+
:youtube:
|
41
|
+
- "The location filter shows you popular videos from the selected country or region on lists like Most Viewed and in search results.If you would like to change either of these preferences, please use the links in the footer at the bottom of the page."
|
42
|
+
- "Click \"OK\" to accept these settings or click \"Cancel\" to set your language preference to \"English (UK)\" and your location filter to \"Worldwide\"."
|
43
|
+
:zefrank:
|
44
|
+
- "If there's anyone who knows how to marshal an online audience, it's Ze Frank."
|
45
|
+
- "Ze is best-known for his 2006 program \"The Show,\" in which he made a new 2-3 minute video every day for 1 year."
|
@@ -0,0 +1,360 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
2
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
3
|
+
<head>
|
4
|
+
<meta http-equiv="X-UA-Compatible" content="IE=8">
|
5
|
+
<script type="text/javascript">
|
6
|
+
//<![CDATA[
|
7
|
+
var page={};var onCondition=function(D,C,A,B){D=D;A=A?Math.min(A,5):5;B=B||100;if(D()){C()}else{if(A>1){setTimeout(function(){onCondition(D,C,A-1,B)},B)}}};
|
8
|
+
//]]>
|
9
|
+
</script>
|
10
|
+
<meta content="text/html; charset=utf-8" http-equiv="Content-Type" />
|
11
|
+
<meta content="en-us" http-equiv="Content-Language" />
|
12
|
+
<meta content="Gobsmacked that TeX/LaTeX (document formatting tools) for OS X is a 1.3GB (yes, GIGAbytes) download OS X. Wow..!" name="description" />
|
13
|
+
<meta content="no" http-equiv="imagetoolbar" />
|
14
|
+
<meta content="width = 780" name="viewport" />
|
15
|
+
<meta content="4FTTxY4uvo0RZTMQqIyhh18HsepyJOctQ+XTOu1zsfE=" name="verify-v1" />
|
16
|
+
<meta content="1" name="page" />
|
17
|
+
<meta content="NOODP" name="robots" />
|
18
|
+
<meta content="n" name="session-loggedin" />
|
19
|
+
<meta content="peterc" name="page-user-screen_name" />
|
20
|
+
<title id="page_title">Twitter / Peter Çoopèr: Gobsmacked that TeX/LaTeX ...</title>
|
21
|
+
<link href="http://a1.twimg.com/a/1276197224/images/favicon.ico" rel="shortcut icon" type="image/x-icon" />
|
22
|
+
<link href="http://a1.twimg.com/a/1276197224/images/twitter_57.png" rel="apple-touch-icon" />
|
23
|
+
|
24
|
+
<link href="http://a2.twimg.com/a/1276197224/stylesheets/twitter.css?1276316974" media="screen" rel="stylesheet" type="text/css" />
|
25
|
+
<link href="http://a2.twimg.com/a/1276197224/stylesheets/geo.css?1276316974" media="screen" rel="stylesheet" type="text/css" />
|
26
|
+
<link href="http://a2.twimg.com/a/1276197224/stylesheets/buttons_new.css?1276316974" media="screen" rel="stylesheet" type="text/css" />
|
27
|
+
<style type="text/css">
|
28
|
+
|
29
|
+
body {
|
30
|
+
background: #ECDFDE url('http://a1.twimg.com/profile_background_images/99434952/twitterbgnew.jpg') fixed no-repeat;
|
31
|
+
|
32
|
+
}
|
33
|
+
|
34
|
+
|
35
|
+
/* Link Color */
|
36
|
+
a,
|
37
|
+
#content tr.hentry:hover a,
|
38
|
+
body#profile #content div.hentry:hover a,
|
39
|
+
#side .stats a:hover span.stats_count,
|
40
|
+
#side div.user_icon a:hover,
|
41
|
+
li.verified-profile a:hover,
|
42
|
+
#side .promotion .definition strong,
|
43
|
+
p.list-numbers a:hover,
|
44
|
+
#side div.user_icon a:hover span,
|
45
|
+
#content .tabMenu li a,
|
46
|
+
.translator-profile a:hover,
|
47
|
+
#local_trend_locations li a,
|
48
|
+
.modal-content .list-slug,
|
49
|
+
.tweet-label a:hover,
|
50
|
+
ol.statuses li.garuda-tweet:hover .actions-hover li span a,
|
51
|
+
ol.statuses li.garuda-tweet .actions-hover li span a:hover {
|
52
|
+
color: #007BE6;
|
53
|
+
}
|
54
|
+
|
55
|
+
body,
|
56
|
+
ul#tabMenu li a, #side .section h1,
|
57
|
+
#side .stat a,
|
58
|
+
#side .stats a span.stats_count,
|
59
|
+
#side div.section-header h1,
|
60
|
+
#side div.user_icon a,
|
61
|
+
#side div.user_icon a:hover,
|
62
|
+
#side div.section-header h3.faq-header,
|
63
|
+
ul.sidebar-menu li.active a,
|
64
|
+
li.verified-profile a,
|
65
|
+
#side .promotion a,
|
66
|
+
body #content .list-header h2,
|
67
|
+
p.list-numbers a,
|
68
|
+
.bar h3 label,
|
69
|
+
body.timeline #content h1,
|
70
|
+
.list-header h2 a span,
|
71
|
+
#content .tabMenu li.active a,
|
72
|
+
body#direct_messages #content .tabMenu #inbox_tab a,
|
73
|
+
body#inbox #content .tabMenu #inbox_tab a,
|
74
|
+
body#sent #content .tabMenu #sent_tab a,
|
75
|
+
body#direct_messages #content .tabMenu #inbox_tab a,
|
76
|
+
body#retweets_by_others #content .tabMenu #retweets_by_others_tab a,
|
77
|
+
body#retweets #content .tabMenu #retweets_tab a,
|
78
|
+
body#retweeted_by_others #content .tabMenu #retweeted_by_others_tab a,
|
79
|
+
body#retweeted_of_mine #content .tabMenu #retweeted_of_mine_tab a,
|
80
|
+
.translator-profile a,
|
81
|
+
#owners_lists h2 a {
|
82
|
+
color: #002533;
|
83
|
+
}
|
84
|
+
|
85
|
+
.unconfirmed-email-banner {
|
86
|
+
border-bottom: solid 1px #FFFFFF;
|
87
|
+
}
|
88
|
+
#side_base {
|
89
|
+
border-left:1px solid #FFFFFF;
|
90
|
+
background-color: #FFFFFF;
|
91
|
+
}
|
92
|
+
|
93
|
+
ul.sidebar-menu li.active a,
|
94
|
+
ul.sidebar-menu li a:hover,
|
95
|
+
#side div#custom_search.active,
|
96
|
+
#side .promotion,
|
97
|
+
.notify div {
|
98
|
+
background-color: #F7F7F7;
|
99
|
+
}
|
100
|
+
|
101
|
+
.list-header,
|
102
|
+
.list-controls,
|
103
|
+
ul.sidebar-list li.active a,
|
104
|
+
ul.sidebar-list li a:hover,
|
105
|
+
.list-header-inner {
|
106
|
+
background-color: #FFFFFF !important;
|
107
|
+
}
|
108
|
+
|
109
|
+
#side .actions,
|
110
|
+
#side .promo,
|
111
|
+
#design .side-section {
|
112
|
+
border: 1px solid #FFFFFF;
|
113
|
+
}
|
114
|
+
|
115
|
+
#side div.section-header h3 {
|
116
|
+
border-bottom: 1px solid #FFFFFF;
|
117
|
+
}
|
118
|
+
|
119
|
+
#side p.sidebar-location {
|
120
|
+
border-bottom: 1px dotted #FFFFFF;
|
121
|
+
}
|
122
|
+
|
123
|
+
#side hr {
|
124
|
+
background: #FFFFFF;
|
125
|
+
color: #FFFFFF;
|
126
|
+
}
|
127
|
+
|
128
|
+
ul.sidebar-menu li.loading a {
|
129
|
+
background: #F7F7F7 url('http://a1.twimg.com/a/1276197224/images/spinner.gif') no-repeat 171px 0.5em !important;
|
130
|
+
}
|
131
|
+
|
132
|
+
#side .collapsible h2.sidebar-title {
|
133
|
+
background: transparent url('http://a2.twimg.com/a/1276197224/images/toggle_up_dark.png') no-repeat center right !important;
|
134
|
+
}
|
135
|
+
|
136
|
+
#side .collapsible.collapsed h2.sidebar-title {
|
137
|
+
background: transparent url('http://a1.twimg.com/a/1276197224/images/toggle_down_dark.png') no-repeat center right !important;
|
138
|
+
}
|
139
|
+
|
140
|
+
#side ul.lists-links li a em {
|
141
|
+
background: url('http://a3.twimg.com/a/1276197224/images/arrow_right_dark.png') no-repeat left top;
|
142
|
+
}
|
143
|
+
|
144
|
+
#side span.pipe {
|
145
|
+
border-left:1px solid #FFFFFF;
|
146
|
+
}
|
147
|
+
|
148
|
+
#list_subscriptions span.view-all,
|
149
|
+
#list_memberships span.view-all,
|
150
|
+
#profile span.view-all,
|
151
|
+
#profile_favorites span.view-all,
|
152
|
+
#following span.view-all,
|
153
|
+
#followers span.view-all {
|
154
|
+
border-left: 0;
|
155
|
+
}
|
156
|
+
|
157
|
+
a.edit-list {
|
158
|
+
border-right: 1px solid #FFFFFF !important;
|
159
|
+
}
|
160
|
+
|
161
|
+
|
162
|
+
|
163
|
+
</style>
|
164
|
+
|
165
|
+
</head>
|
166
|
+
|
167
|
+
<body class="status" id="show">
|
168
|
+
<script type="text/javascript">
|
169
|
+
//<![CDATA[
|
170
|
+
if (window.top !== window.self) {document.write = "";window.top.location = window.self.location; setTimeout(function(){document.body.innerHTML='';},1);window.self.onload=function(evt){document.body.innerHTML='';};}
|
171
|
+
//]]>
|
172
|
+
</script>
|
173
|
+
|
174
|
+
|
175
|
+
<div id="dim-screen"></div>
|
176
|
+
<ul id="accessibility" class="offscreen">
|
177
|
+
<li><a href="#content" accesskey="0">Skip past navigation</a></li>
|
178
|
+
<li>On a mobile phone? Check out <a href="http://m.twitter.com/">m.twitter.com</a>!</li>
|
179
|
+
<li><a href="#footer" accesskey="2">Skip to navigation</a></li>
|
180
|
+
<li><a href="#signin">Skip to sign in form</a></li>
|
181
|
+
</ul>
|
182
|
+
|
183
|
+
|
184
|
+
|
185
|
+
|
186
|
+
<div id="container">
|
187
|
+
<span id="loader" style="display:none"><img alt="Loader" src="http://a0.twimg.com/a/1276197224/images/loader.gif" /></span>
|
188
|
+
<div class="clearfix" id="header">
|
189
|
+
<a href="http://twitter.com/" title="Twitter / Home" accesskey="1" id="logo">
|
190
|
+
<img alt="Twitter.com" height="36" src="http://a0.twimg.com/a/1276197224/images/twitter_logo_header.png" width="155" />
|
191
|
+
</a>
|
192
|
+
<form method="post" id="sign_out_form" action="/sessions/destroy" style="display:none;">
|
193
|
+
<input name="authenticity_token" value="f6d379112383ae1aaa5991cc4adb776570eeaae7" type="hidden"/>
|
194
|
+
</form>
|
195
|
+
|
196
|
+
|
197
|
+
<ul class="top-navigation round">
|
198
|
+
<li><a href="/login" accesskey="l">Login</a></li>
|
199
|
+
<li class="signup-link"><a href="/signup">Join Twitter!</a></li>
|
200
|
+
</ul>
|
201
|
+
|
202
|
+
|
203
|
+
</div>
|
204
|
+
|
205
|
+
|
206
|
+
|
207
|
+
|
208
|
+
|
209
|
+
<div class="content-bubble-arrow"></div>
|
210
|
+
|
211
|
+
|
212
|
+
|
213
|
+
<div id="content" class="wide">
|
214
|
+
<div class="wrapper">
|
215
|
+
<div id="permalink" class="status">
|
216
|
+
|
217
|
+
|
218
|
+
<div class="hentry u-peterc status" id="status_15518147137"
|
219
|
+
>
|
220
|
+
<span class="status-body">
|
221
|
+
<span class="status-content">
|
222
|
+
<span class="entry-content">Gobsmacked that TeX/LaTeX (document formatting tools) for OS X is a 1.3GB (yes, GIGAbytes) download OS X. Wow..!</span>
|
223
|
+
</span>
|
224
|
+
<span class="meta entry-meta" data='{}'>
|
225
|
+
<a class="entry-date" rel="bookmark" href="http://twitter.com/peterc/status/15518147137">
|
226
|
+
<span class="published timestamp" data="{time:'Sat Jun 05 23:36:18 +0000 2010'}">4:36 PM Jun 5th</span></a>
|
227
|
+
<span>via web</span>
|
228
|
+
|
229
|
+
</span>
|
230
|
+
|
231
|
+
<ul class="meta-data clearfix">
|
232
|
+
|
233
|
+
</ul>
|
234
|
+
</span>
|
235
|
+
</div>
|
236
|
+
|
237
|
+
|
238
|
+
<div class="user-info clear">
|
239
|
+
<div class="thumb"><a href="http://twitter.com/peterc" class="tweet-url profile-pic" hreflang="en"><img alt="" border="0" height="73" src="http://a3.twimg.com/profile_images/881957953/new-suit-one-22_bigger.png" style="vertical-align:middle" width="73" /></a></div>
|
240
|
+
<div class=""><a href="http://twitter.com/peterc" class="tweet-url screen-name" hreflang="en" title="Peter Çoopèr">peterc</a></div>
|
241
|
+
<div class="full-name">Peter Çoopèr</div>
|
242
|
+
</div>
|
243
|
+
|
244
|
+
|
245
|
+
|
246
|
+
</div>
|
247
|
+
|
248
|
+
|
249
|
+
</div>
|
250
|
+
</div>
|
251
|
+
|
252
|
+
|
253
|
+
|
254
|
+
<div id="footer" class="round wide">
|
255
|
+
<h3 class="offscreen">Footer</h3>
|
256
|
+
|
257
|
+
|
258
|
+
<ul class="footer-nav">
|
259
|
+
<li class="first">© 2010 Twitter</li>
|
260
|
+
<li><a href="/about">About Us</a></li>
|
261
|
+
<li><a href="/about/contact">Contact</a></li>
|
262
|
+
<li><a href="http://blog.twitter.com">Blog</a></li>
|
263
|
+
<li><a href="http://status.twitter.com">Status</a></li>
|
264
|
+
<li><a href="/goodies">Goodies</a></li>
|
265
|
+
<li><a href="http://dev.twitter.com/">API</a></li>
|
266
|
+
<li><a href="http://business.twitter.com/twitter101">Business</a></li>
|
267
|
+
<li><a href="http://support.twitter.com">Help</a></li>
|
268
|
+
<li><a href="/jobs">Jobs</a></li>
|
269
|
+
<li><a href="/tos">Terms</a></li>
|
270
|
+
<li><a href="/privacy">Privacy</a></li>
|
271
|
+
</ul>
|
272
|
+
</div>
|
273
|
+
|
274
|
+
|
275
|
+
|
276
|
+
<hr />
|
277
|
+
|
278
|
+
</div>
|
279
|
+
|
280
|
+
|
281
|
+
|
282
|
+
<script src="http://ajax.googleapis.com/ajax/libs/jquery/1.3.0/jquery.min.js" type="text/javascript"></script>
|
283
|
+
<script src="http://a1.twimg.com/a/1276197224/javascripts/twitter.js?1276316974" type="text/javascript"></script>
|
284
|
+
<script src="http://a3.twimg.com/a/1276197224/javascripts/lib/jquery.tipsy.min.js?1276316974" type="text/javascript"></script>
|
285
|
+
<script type="text/javascript" src='http://maps.google.com/maps/api/js?sensor=false'></script>
|
286
|
+
<script src="http://a3.twimg.com/a/1276197224/javascripts/lib/gears_init.js?1276316974" type="text/javascript"></script>
|
287
|
+
<script src="http://a1.twimg.com/a/1276197224/javascripts/geov1.js?1276316974" type="text/javascript"></script>
|
288
|
+
<script src="http://a3.twimg.com/a/1276197224/javascripts/api.js?1276316974" type="text/javascript"></script>
|
289
|
+
<script type="text/javascript">
|
290
|
+
//<![CDATA[
|
291
|
+
$.cookie('tz_offset_sec', (-1 * (new Date()).getTimezoneOffset())*60);
|
292
|
+
//]]>
|
293
|
+
</script>
|
294
|
+
<script type="text/javascript">
|
295
|
+
//<![CDATA[
|
296
|
+
|
297
|
+
|
298
|
+
$(document).ready(function() {
|
299
|
+
$.Statuses.initialize($('#permalink'));
|
300
|
+
});
|
301
|
+
page.controller_name = 'StatusController';
|
302
|
+
page.action_name = 'show';
|
303
|
+
twttr.form_authenticity_token = 'f6d379112383ae1aaa5991cc4adb776570eeaae7';
|
304
|
+
$.ajaxSetup({ data: { authenticity_token: 'f6d379112383ae1aaa5991cc4adb776570eeaae7' } });
|
305
|
+
|
306
|
+
// FIXME: Reconcile with the kinds on the Status model.
|
307
|
+
twttr.statusKinds = {
|
308
|
+
UPDATE: 1,
|
309
|
+
SHARE: 2
|
310
|
+
};
|
311
|
+
twttr.ListPerUserLimit = 20;
|
312
|
+
|
313
|
+
|
314
|
+
|
315
|
+
|
316
|
+
//]]>
|
317
|
+
</script>
|
318
|
+
|
319
|
+
<!-- BEGIN google analytics -->
|
320
|
+
|
321
|
+
<script type="text/javascript">
|
322
|
+
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
|
323
|
+
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
|
324
|
+
</script>
|
325
|
+
|
326
|
+
<script type="text/javascript">
|
327
|
+
|
328
|
+
try {
|
329
|
+
var pageTracker = _gat._getTracker("UA-30775-6");
|
330
|
+
pageTracker._setDomainName("twitter.com");
|
331
|
+
pageTracker._setVar('Not Logged In');
|
332
|
+
pageTracker._setVar('lang: en');
|
333
|
+
pageTracker._initData();
|
334
|
+
|
335
|
+
pageTracker._trackPageview('/statuses/peterc/15518147137');
|
336
|
+
} catch(err) { }
|
337
|
+
|
338
|
+
</script>
|
339
|
+
|
340
|
+
<!-- END google analytics -->
|
341
|
+
|
342
|
+
|
343
|
+
<script type="text/javascript">
|
344
|
+
$('.hashflag').trackHashflagUsage();
|
345
|
+
</script>
|
346
|
+
|
347
|
+
|
348
|
+
|
349
|
+
|
350
|
+
|
351
|
+
<div id="notifications"></div>
|
352
|
+
|
353
|
+
|
354
|
+
|
355
|
+
|
356
|
+
|
357
|
+
|
358
|
+
</body>
|
359
|
+
|
360
|
+
</html>
|