charles 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +10 -0
- data/Rakefile +13 -0
- data/bin/charles +23 -0
- data/charles.gemspec +25 -0
- data/lib/charles/document.rb +177 -0
- data/lib/charles/images.rb +77 -0
- data/lib/charles/internal_attributes.rb +40 -0
- data/lib/charles/misc.rb +84 -0
- data/lib/charles/version.rb +3 -0
- data/lib/charles.rb +66 -0
- data/optimise.rb +72 -0
- data/test/articles/20120525_1525_straitstimes.com.content.txt +5 -0
- data/test/articles/20120525_1525_straitstimes.com.html +1929 -0
- data/test/articles/20120525_1534_bbc.co.uk.content.txt +19 -0
- data/test/articles/20120525_1534_bbc.co.uk.html +1777 -0
- data/test/articles/20120525_1727_bbc.co.uk.content.txt +39 -0
- data/test/articles/20120525_1727_bbc.co.uk.html +1889 -0
- data/test/articles/20120525_1730_channelnewsasia.com.content.txt +19 -0
- data/test/articles/20120525_1730_channelnewsasia.com.html +963 -0
- data/test/articles/20120525_1733_channelnewsasia.com.content.txt +19 -0
- data/test/articles/20120525_1733_channelnewsasia.com.html +923 -0
- data/test/articles/20120525_1736_nytimes.com.content.txt +21 -0
- data/test/articles/20120525_1736_nytimes.com.html +856 -0
- data/test/articles/20120525_1743_nytimes.com.content.txt +11 -0
- data/test/articles/20120525_1743_nytimes.com.html +98 -0
- data/test/articles/20120525_1747_techcrunch.com.content.txt +11 -0
- data/test/articles/20120525_1747_techcrunch.com.html +1098 -0
- data/test/articles/20120528_0929_washingtonpost.com.content.txt +23 -0
- data/test/articles/20120528_0929_washingtonpost.com.html +3335 -0
- data/test/articles/20120528_0931_latimes.com.content.txt +45 -0
- data/test/articles/20120528_0931_latimes.com.html +6371 -0
- data/test/articles/20120528_0938_entertainment.time.com.content.txt +31 -0
- data/test/articles/20120528_0938_entertainment.time.com.html +1261 -0
- data/test/articles/20120528_0943_bloomberg.com.content.txt +13 -0
- data/test/articles/20120528_0943_bloomberg.com.html +2874 -0
- data/test/articles/20120528_0947_reuters.com.content.txt +35 -0
- data/test/articles/20120528_0947_reuters.com.html +1563 -0
- data/test/articles/20120528_1106_reuters.com.content.txt +5 -0
- data/test/articles/20120528_1106_reuters.com.html +551 -0
- data/test/articles/20120528_1109_musicthing.blogspot.co.uk.content.txt +19 -0
- data/test/articles/20120528_1109_musicthing.blogspot.co.uk.html +865 -0
- data/test/articles/20120528_1114_mobileinc.co.uk.content.txt +15 -0
- data/test/articles/20120528_1114_mobileinc.co.uk.html +550 -0
- data/test/articles/20120528_1119_forbes.com.content.txt +15 -0
- data/test/articles/20120528_1119_forbes.com.html +1406 -0
- data/test/articles/20120528_1122_techcrunch.com.content.txt +58 -0
- data/test/articles/20120528_1122_techcrunch.com.html +1131 -0
- data/test/articles/20120528_1126_blogs.adobe.com.content.txt +13 -0
- data/test/articles/20120528_1126_blogs.adobe.com.html +303 -0
- data/test/articles/20120528_1142_thestar.com.my.content.txt +27 -0
- data/test/articles/20120528_1142_thestar.com.my.html +943 -0
- data/test/articles/20120528_1146_suntimes.com.content.txt +33 -0
- data/test/articles/20120528_1146_suntimes.com.html +5166 -0
- data/test/articles/20120528_1148_asiaone.com.content.txt +27 -0
- data/test/articles/20120528_1148_asiaone.com.html +1070 -0
- data/test/articles/20120529_1120_online.wsj.com.content.txt +56 -0
- data/test/articles/20120529_1120_online.wsj.com.html +3035 -0
- data/test/articles/20120529_1122_online.wsj.com.content.txt +35 -0
- data/test/articles/20120529_1122_online.wsj.com.html +2725 -0
- data/test/articles/20120529_1127_smh.com.au.content.txt +13 -0
- data/test/articles/20120529_1127_smh.com.au.html +2034 -0
- data/test/articles.yml +221 -0
- data/test/test_charles.rb +70 -0
- metadata +279 -0
data/test/articles.yml
ADDED
@@ -0,0 +1,221 @@
|
|
1
|
+
-
|
2
|
+
:file: '20120525_1525_straitstimes.com'
|
3
|
+
:url: 'http://www.straitstimes.com/BreakingNews/Singapore/Story/STIStory_802924.html'
|
4
|
+
:expected:
|
5
|
+
:title: 'PAP: Chance to start afresh in Hougang'
|
6
|
+
:lede: 'PM Lee, Khaw Boon Wan urge residents to back young candidate'
|
7
|
+
:author: 'RACHEL CHANG'
|
8
|
+
:image: 'http://www.straitstimes.com/STI/STIMEDIA/image/20120524/ST_IMAGES_RALLYSHARPe.jpg'
|
9
|
+
-
|
10
|
+
:file: '20120525_1534_bbc.co.uk'
|
11
|
+
:url: 'http://www.bbc.co.uk/news/world-africa-18202545'
|
12
|
+
:expected:
|
13
|
+
:title: 'French President Hollande in Afghanistan on troop visit'
|
14
|
+
:lede: 'French President Francois Hollande has arrived in Afghanistan on a previously unannounced visit.'
|
15
|
+
:author: nil
|
16
|
+
:image: 'http://news.bbcimg.co.uk/media/images/60479000/jpg/_60479902_hollandeafp.jpg'
|
17
|
+
:images:
|
18
|
+
- 'http://news.bbcimg.co.uk/media/images/58929000/gif/_58929603_afghanistan_troops_464_jan2012.gif'
|
19
|
+
-
|
20
|
+
:file: '20120525_1727_bbc.co.uk'
|
21
|
+
:url: 'http://www.bbc.co.uk/news/health-18190352'
|
22
|
+
:expected:
|
23
|
+
:title: 'Male pill: gene discovery may lead to contraceptive'
|
24
|
+
:lede: 'It may be possible to develop a new male contraceptive pill after researchers in Edinburgh identified a gene critical for the production of healthy sperm.'
|
25
|
+
:author: nil
|
26
|
+
:image: 'http://news.bbcimg.co.uk/media/images/60456000/jpg/_60456207_c0126522-human_sperm_sem_x2.jpg'
|
27
|
+
-
|
28
|
+
:file: '20120525_1730_channelnewsasia.com'
|
29
|
+
:url: 'http://www.channelnewsasia.com/stories/singaporelocalnews/view/1203419/1/.html'
|
30
|
+
:expected:
|
31
|
+
:title: 'Cooling-off Day for Hougang By-Election'
|
32
|
+
:lede: 'SINGAPORE: It is Cooling-off Day for the Hougang By-Election on Friday.'
|
33
|
+
:author: nil
|
34
|
+
:image: 'http://www.channelnewsasia.com/components/display_image.php?id=494293'
|
35
|
+
-
|
36
|
+
:file: '20120525_1733_channelnewsasia.com'
|
37
|
+
:url: 'http://www.channelnewsasia.com/stories/singaporebusinessnews/view/1203496/1/.html'
|
38
|
+
:expected:
|
39
|
+
:title: "Singapore's manufacturing output contracts in April"
|
40
|
+
:lede: "SINGAPORE: Singapore's manufacturing output contracted unexpectedly for the second consecutive month in April, dragged down by a drop in production from the biomedical and electronics sectors."
|
41
|
+
:author: nil
|
42
|
+
:image: 'http://www.channelnewsasia.com/components/display_image.php?id=466095'
|
43
|
+
-
|
44
|
+
:file: '20120525_1736_nytimes.com'
|
45
|
+
:url: 'http://www.nytimes.com/2012/05/26/world/middleeast/egypt-presidential-election-runoff.html?_r=1&ref=global-home#h[]'
|
46
|
+
:expected:
|
47
|
+
:title: "Muslim Brotherhood Candidate to Face Former Prime Minister in Egyptian Runoff"
|
48
|
+
:lede: "CAIRO — The Islamist candidate of the Muslim Brotherhood will face former President Hosni Mubarak’s last prime minister in a runoff to become Egypt’s first freely elected president, several independent vote counts concluded Friday morning."
|
49
|
+
:author: 'DAVID D. KIRKPATRICK'
|
50
|
+
:image: 'http://graphics8.nytimes.com/images/2012/05/26/global-home/26egypt-image/26egypt-image-articleInline.jpg'
|
51
|
+
-
|
52
|
+
:file: '20120525_1743_nytimes.com'
|
53
|
+
:url: 'http://bits.blogs.nytimes.com/2012/05/24/mark-zuckerberg-officially-a-billionaire/?ref=technology'
|
54
|
+
:expected:
|
55
|
+
:title: "Mark Zuckerberg Officially a Billionaire"
|
56
|
+
:lede: "Mark Zuckerberg, the founder of Facebook, is now officially a billionaire. Until now, Mr. Zuckerberg has been staggeringly rich only on paper."
|
57
|
+
:author: 'NICK BILTON'
|
58
|
+
:image: 'http://graphics8.nytimes.com/images/2012/05/23/technology/bits-zuck-stock/bits-zuck-stock-tmagArticle.jpg'
|
59
|
+
-
|
60
|
+
:file: '20120525_1747_techcrunch.com'
|
61
|
+
:url: 'http://techcrunch.com/2012/05/24/manpacks-startup-perks-condoms/'
|
62
|
+
:expected:
|
63
|
+
:title: "Manpacks Launches Its Startup Perks Program With Free Condoms"
|
64
|
+
:lede: "Well, that’s one way to get some attention for your product launch."
|
65
|
+
:author: 'Anthony Ha'
|
66
|
+
:image: 'http://tctechcrunch2011.files.wordpress.com/2012/05/photo-51.jpg?w=288'
|
67
|
+
-
|
68
|
+
:file: '20120528_0929_washingtonpost.com'
|
69
|
+
:url: 'http://www.washingtonpost.com/world/middle_east/un-security-council-blames-syrian-forces-for-shelling-houla-condemns-attacks-on-civilians/2012/05/27/gJQAjoyLvU_story.html'
|
70
|
+
:expected:
|
71
|
+
:title: "UN Security Council blames Syrian forces for shelling Houla, condemns attacks on civilians"
|
72
|
+
:lede: "UNITED NATIONS — The U.N. Security Council on Sunday blamed the Syrian government for attacking residential areas of the town of Houla with artillery and tank shelling and also condemned the close-range killings of civilians there — but avoided saying who was responsible for the massacre of more than 100 men, women and children."
|
73
|
+
:author: 'Associated Press'
|
74
|
+
:image: nil
|
75
|
+
-
|
76
|
+
:file: '20120528_0931_latimes.com'
|
77
|
+
:url: 'http://www.latimes.com/sports/motorracing/la-sp-indy-500-20120528,0,357526.story'
|
78
|
+
:expected:
|
79
|
+
:title: "Dario Franchitti is last leader standing, wins his third Indy 500"
|
80
|
+
:lede: "Dario Franchitti rebuffs Takuma Sato's frantic last-lap bid to win Indy 500 with a record 34 lead changes. It's a sentimental victory as the race honors the late Dan Wheldon, Franchitti's close friend."
|
81
|
+
:author: 'Jim Peltz'
|
82
|
+
:image: 'http://www.trbimg.com/img-4fc2ce93/turbine/la-sp-indy-500-20120528-001/600'
|
83
|
+
-
|
84
|
+
:file: '20120528_0938_entertainment.time.com'
|
85
|
+
:url: 'http://entertainment.time.com/2012/05/27/the-palme-damour/?iid=ent-main-lede'
|
86
|
+
:expected:
|
87
|
+
:title: "The Palme d'Amour: Director Michael Haneke Takes Cannes' Top Prize—Again"
|
88
|
+
:lede: "Michael Haneke's quietly magnificent story of love on the brink of death takes the top prize, while seven American films oozing with star quality get no love at all"
|
89
|
+
:author: 'Richard Corliss'
|
90
|
+
:image: 'http://timeentertainment.files.wordpress.com/2012/05/amour.jpg?w=600&h=400&crop=1'
|
91
|
+
-
|
92
|
+
:file: '20120528_0943_bloomberg.com'
|
93
|
+
:url: 'http://www.bloomberg.com/news/2012-05-28/euro-u-s-equity-futures-advance-on-greece-optimism.html'
|
94
|
+
:expected:
|
95
|
+
:title: "Euro, U.S. Equity Futures Advance on Greece Optimism"
|
96
|
+
:lede: "The euro strengthened for the first time in five days, U.S. equity-index futures advanced and oil gained after Greek opinion polls showed voters backing parties that support the European Union’s bailout, easing concern the country will exit the currency bloc."
|
97
|
+
:author: 'Bloomberg News'
|
98
|
+
:image: nil
|
99
|
+
-
|
100
|
+
:file: '20120528_0947_reuters.com'
|
101
|
+
:url: 'http://www.reuters.com/article/2012/05/27/us-nuclear-iran-uranium-idUSBRE84O0SN20120527'
|
102
|
+
:expected:
|
103
|
+
:title: "Iran not ready for visit to suspect nuclear site"
|
104
|
+
:lede: "The U.N. nuclear watchdog has not yet given good enough reasons to visit an Iranian site where it suspects there may have been experiments for developing nuclear weapons, Iranian media said."
|
105
|
+
:author: 'Marcus George'
|
106
|
+
:image: 'http://s1.reutersmedia.net/resources/r/?m=02&d=20120527&t=2&i=612137389&w=&fh=&fw=&ll=700&pl=300&r=CBRE84O17PO00'
|
107
|
+
:images:
|
108
|
+
- 'http://s1.reutersmedia.net/resources/r/?m=02&d=20120527&t=2&i=612137390&w=&fh=&fw=&ll=700&pl=300&r=CBRE84O17PP00'
|
109
|
+
-
|
110
|
+
:file: '20120528_1106_reuters.com'
|
111
|
+
:url: 'https://www.eff.org/deeplinks/2012/05/megaupload-user-asks-court-files-back-again'
|
112
|
+
:expected:
|
113
|
+
:title: "Megaupload User Asks Court for Files Back. Again."
|
114
|
+
:lede: "You may remember that EFF’s client, Kyle Goodwin, asked the court to return the legal files he lost when Megaupload was seized last January. Since then, we’ve been to court, both for a hearing and a mediation, and nothing has changed. The key problem: the government has failed to help third parties like Kyle get access to their data. So we have no choice but to go back to court."
|
115
|
+
:author: 'Julie Samuels'
|
116
|
+
:image: nil
|
117
|
+
-
|
118
|
+
:file: '20120528_1109_musicthing.blogspot.co.uk'
|
119
|
+
:url: 'http://musicthing.blogspot.co.uk/2005/05/tiny-music-makers-pt-3-thx-sound.html'
|
120
|
+
:expected:
|
121
|
+
:title: "TINY MUSIC MAKERS: Pt 3: The THX Sound"
|
122
|
+
:lede: '"I like to say that the THX sound is the most widely-recognized piece of computer-generated music in the world," says Andy Moorer. "This may or may not be true, but it sounds cool!"'
|
123
|
+
:author: 'Tom Whitwell'
|
124
|
+
:image: 'http://photos1.blogger.com/blogger/4749/510/1600/moorer.jpg'
|
125
|
+
-
|
126
|
+
:file: '20120528_1114_mobileinc.co.uk'
|
127
|
+
:url: 'http://www.mobileinc.co.uk/2012/03/ive-started-a-newsletter-the-mobile-inc-digest/'
|
128
|
+
:expected:
|
129
|
+
:title: "I’ve Started A Newsletter – The Mobile Inc Digest"
|
130
|
+
:lede: "I’m spending so much time designing, working and recovering (shoulder surgery ouch) that there’s less and less time to blog right now. However I’m reading and researching more than ever."
|
131
|
+
:author: 'Murat'
|
132
|
+
:image: nil
|
133
|
+
-
|
134
|
+
:file: '20120528_1119_forbes.com'
|
135
|
+
:url: 'http://www.forbes.com/sites/haydnshaughnessy/2012/05/26/why-crisis-in-spain-this-week-should-have-us-more-worried-than-greece/'
|
136
|
+
:expected:
|
137
|
+
:title: "Why Crisis in Spain This Week Became More Important Than Greece"
|
138
|
+
:lede: "Local debt is the big untold story of the Euro crisis and, if that was not apparent before, it became glaringly so when Catalonia’s President this week told the world his autonomous Catalan Government would struggle to meet its bills at the end of this month."
|
139
|
+
:author: 'Haydn Shaughnessy'
|
140
|
+
:image: nil
|
141
|
+
-
|
142
|
+
:file: '20120528_1122_techcrunch.com'
|
143
|
+
:url: 'http://techcrunch.com/2012/05/27/hey-kids-get-off-my-lawn-the-once-and-future-visual-programming-environment/'
|
144
|
+
:expected:
|
145
|
+
:title: "Hey Kids, Get Off My Lawn: The Once And Future Visual Programming Environment"
|
146
|
+
:lede: ""
|
147
|
+
:author: 'Kwindla Hultman Kramer'
|
148
|
+
:image: 'http://tctechcrunch2011.files.wordpress.com/2012/05/seededfertilizedlawn.jpg?w=288'
|
149
|
+
-
|
150
|
+
:file: '20120528_1126_blogs.adobe.com'
|
151
|
+
:url: 'http://blogs.adobe.com/digitalmarketing/personalization/conversion-optimization/understand-the-math-behind-it-all-bayesian-statistics/'
|
152
|
+
:expected:
|
153
|
+
:title: "Understand the Math Behind it All: Bayesian Statistics"
|
154
|
+
:lede: "Most mar�ket�ing peo�ple have only a pass�ing inter�ac�tion with sta�tis�tics, and often times only under�stand it as a mea�sure of how it has impacted their daily life. One of the funny things peo�ple don�t real�ize is that there are two com�pletely dif�fer�ent com�pet�ing schools of thought when it comes to sta�tis�tics. Most peo�ple are famil�iar with fre�quen�tist sta�tis�tics, hav�ing dealt with things like nor�mal dis�tri�b�u�tion, bell curves, and estab�lished prob�a�bil�i�ties. The other school, Bayesian sta�tis�tics, is a realm that fewer peo�ple are famil�iar with, but just as applic�a�ble. In fact, the move over the last few years is for more peo�ple to change from the fre�quen�tist model to Bayesian techniques."
|
155
|
+
:author: 'Andrew Anderson'
|
156
|
+
:image: nil
|
157
|
+
-
|
158
|
+
:file: '20120528_1142_thestar.com.my'
|
159
|
+
:url: 'http://thestar.com.my/news/story.asp?file=/2012/5/27/nation/11368993&sec=nation'
|
160
|
+
:expected:
|
161
|
+
:title: "Videogame music writers on tour here at Istana Budaya"
|
162
|
+
:lede: "PETALING JAYA: Computer game soundtracks are as recognisable as movie scores these days and have garnered a huge following of fans."
|
163
|
+
:author: 'TAN KIT HOONG and CHONG JINN XIUNG'
|
164
|
+
:image: 'http://thestar.com.my/archives/2012/5/27/nation/n_10Tallarico.jpg'
|
165
|
+
-
|
166
|
+
:file: '20120528_1146_suntimes.com'
|
167
|
+
:url: 'http://www.suntimes.com/sports/autoracing/12812647-419/dario-franchitti-wins-his-third-indy-500-in-honor-of-late-friend-dan-wheldon.html'
|
168
|
+
:expected:
|
169
|
+
:title: "Dario Franchitti wins his third Indy 500 in honor of late friend Dan Wheldon"
|
170
|
+
:lede: "INDIANAPOLIS — You can debate whether the Indianapolis 500 is the world’s greatest race."
|
171
|
+
:author: 'HERB GOULD'
|
172
|
+
:image: 'http://www.suntimes.com/csp/cms/sites/dt.common.streams.StreamServer.cls?STREAMOID=G9CD0HBsfM6Ez9hJq$J_vs$daE2N3K4ZzOUsqbU5sYtfyOFkwIFfaZvhenwZq5N7WCsjLu883Ygn4B49Lvm9bPe2QeMKQdVeZmXF$9l$4uCZ8QDXhaHEp3rvzXRJFdy0KqPHLoMevcTLo3h8xh70Y6N_U_CryOsw6FTOdKL_jpQ-&CONTENTTYPE=image/jpeg'
|
173
|
+
-
|
174
|
+
:file: '20120528_1148_asiaone.com'
|
175
|
+
:url: 'http://news.asiaone.com/News/Latest%2BNews/Singapore/Story/A1Story20120528-348743.html'
|
176
|
+
:expected:
|
177
|
+
:title: "Choo: Back on the ground today"
|
178
|
+
:lede: "He may not have been successful in the Hougang by-election but, for People's Action Party's (PAP's) Desmond Choo, it is back to work today, helping residents on the ground."
|
179
|
+
:author: 'Gwendolyn Ng'
|
180
|
+
:image: 'http://news.asiaone.com/A1MEDIA/news/05May12/images/20120528.083508_dchood.jpg'
|
181
|
+
-
|
182
|
+
:file: '20120529_1120_online.wsj.com'
|
183
|
+
:url: 'http://online.wsj.com/article/SB10001424052702303674004577433160886451978.html'
|
184
|
+
:expected:
|
185
|
+
:title: "Storied Law Firm Dewey Files Chapter 11"
|
186
|
+
:lede: "The embattled New York law firm Dewey & LeBoeuf LLP has filed for bankruptcy protection, a move that effectively ends what had been at its height a 1,300-lawyer global enterprise and marks one of the largest law-firm failures in U.S. history."
|
187
|
+
:author: 'JENNIFER SMITH and ASHBY JONES'
|
188
|
+
:image: 'http://si.wsj.net/public/resources/images/OB-SV369_dewey0_D_20120503161002.jpg'
|
189
|
+
-
|
190
|
+
:file: '20120529_1122_online.wsj.com'
|
191
|
+
:url: 'http://online.wsj.com/article/SB10001424052702303674004577432412894441148.html?mod=WSJ_Tech_Europe_INTL_LSMODULE'
|
192
|
+
:expected:
|
193
|
+
:title: "Advanced Malware Targets Middle East"
|
194
|
+
:lede: 'Computer malware described as "the most sophisticated cyberweapon yet unleashed" has been uncovered in computers in the Middle East and may have infected machines in Europe, according to reports from antivirus researchers and software makers in Russia, Hungary and Ireland.'
|
195
|
+
:author: nil
|
196
|
+
:image: nil
|
197
|
+
-
|
198
|
+
:file: '20120529_1127_smh.com.au'
|
199
|
+
:url: 'http://www.smh.com.au/digital-life/digital-life-news/mark-zuckerberg-makes-surprise-cameo-on-chinese-tv-20120529-1zfw2.html'
|
200
|
+
:expected:
|
201
|
+
:title: "Mark Zuckerberg makes surprise cameo on Chinese TV"
|
202
|
+
:lede: "BEIJING — Social media sites and blogs have lit up after eagle-eyed viewers spotted a surprise cameo in a Chinese TV documentary about the country's police force: Facebook founder Mark Zuckerberg and his now-wife, Priscilla Chan."
|
203
|
+
:author: nil
|
204
|
+
:image: 'http://images.smh.com.au/2012/05/29/3333176/art_zuckerbergs-420x0.jpg'
|
205
|
+
-
|
206
|
+
:file: ''
|
207
|
+
:url: ''
|
208
|
+
:expected:
|
209
|
+
:title: ""
|
210
|
+
:lede: ""
|
211
|
+
:author: ''
|
212
|
+
:image: ''
|
213
|
+
-
|
214
|
+
:file: ''
|
215
|
+
:url: ''
|
216
|
+
:expected:
|
217
|
+
:title: ""
|
218
|
+
:lede: ""
|
219
|
+
:author: ''
|
220
|
+
:image: ''
|
221
|
+
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'charles'
|
3
|
+
require 'yaml'
|
4
|
+
#require 'active_support/testing/assertions'
|
5
|
+
|
6
|
+
# Index of test fixtures loaded from articles.yml: each entry carries a :file
# stem, the source :url and a hash of :expected values. The path is resolved
# relative to this test file (the same way :tmp_path is below) so the suite
# does not depend on the working directory it is started from.
TEST_ARTICLES = YAML.load_file(File.join(File.dirname(__FILE__), "articles.yml"))

# Point Charles' :tmp_path option at a "tmp" directory beside this test file.
Charles.options[:tmp_path] = File.dirname(__FILE__) + "/tmp"
|
9
|
+
|
10
|
+
# Regression tests for Charles::Document, driven by the saved article
# fixtures indexed in TEST_ARTICLES.
class CharlesTest < Test::Unit::TestCase
  #include ActiveSupport::Testing::Assertions

  # Directory holding the saved pages (<file>.html) and their expected
  # content (<file>.content.txt). Anchored to this file so the tests do not
  # depend on the working directory they are launched from.
  ARTICLES_DIR = File.join(File.dirname(__FILE__), "articles")

  # URL of the fixture article used by test_clean_title and
  # test_filtered_images.
  WSJ_ARTICLE_URL = 'http://online.wsj.com/article/SB10001424052702303674004577433160886451978.html'

  def setup
  end

  # Runs the extractor over every fixture in TEST_ARTICLES and checks the
  # aggregate content, title and image scores against fixed thresholds.
  def test_articles
    _scores = {:content => [], :title => [], :image => []}
    TEST_ARTICLES.each{|article|
      # The fixture index ends with blank template entries (empty :file).
      next if article[:file].to_s.empty?
      input = read_fixture(article, "html")
      document = Charles::Document.new(input, :url => article[:url])
      result = document.content
      expected = read_fixture(article, "content.txt")
      content_score = Charles::Misc.compare_strings(result, expected)
      #pp [content_score, result, expected, article[:url]] if content_score < 0.1
      _scores[:content] << content_score
      title_score = Charles::Misc.compare_strings(document.title, article[:expected][:title])
      #pp [title_score, document.title, article[:expected][:title], article[:url]] if title_score < 0.1
      _scores[:title] << title_score

      # Only score articles that really have an expected image; see
      # image_expected? for why a plain truthiness check is not enough.
      expected_image = article[:expected][:image]
      if image_expected?(expected_image)
        _scores[:image] << (document.images.index(expected_image) ? 1 : 0)
      end
    }

    # NOTE(review): thresholds are kept exactly as they were; the meaning of
    # compare_strings scores and the origin of Array#mean are not visible in
    # this file -- confirm against Charles::Misc before tuning them.
    assert _scores[:content].select{|score| score < 0.5}.mean > 0.2
    assert _scores[:content].select{|score| score < 0.1}.size < 4
    assert _scores[:content].select{|score| score < 0.01}.size < 1
    assert _scores[:title].select{|score| score < 0.5}.mean > 0.15
    assert _scores[:title].select{|score| score < 0.1}.size < 5
    assert _scores[:title].select{|score| score < 0.01}.size < 2
    assert _scores[:image].mean > 0.4
  end

  # Checks that the raw title contains the 'WSJ.com' suffix and that
  # clean_title, given :sample_titles, does not.
  def test_clean_title
    article = wsj_article
    input = read_fixture(article, "html")
    # NOTE(review): two entries are identical and contain a replacement
    # character where a dash was presumably intended; they are reproduced
    # unchanged because they are runtime input to the title cleaner.
    sample_titles = ['Former ML closer Armando Benitez signs with Ducks - WSJ.com',
                     'The Top 10 Clean-Tech Companies - WSJ.com',
                     'Book Review: Internal Time - WSJ.com',
                     'NASA Working With Private Sector � Letters to the Editor - WSJ.com',
                     'NASA Working With Private Sector � Letters to the Editor - WSJ.com',
                     'San Francisco Symphony Orchestra | Radicals Ready for the Road - WSJ.com']
    document = Charles::Document.new(input, :url => article[:url], :sample_titles => sample_titles)
    assert document.title.include?('WSJ.com')
    assert !document.clean_title.include?('WSJ.com')
  end

  # Checks that filtered_images yields more than three entries and that the
  # last one carries :data, :width and :height above minimum sizes.
  def test_filtered_images
    article = wsj_article
    input = read_fixture(article, "html")
    document = Charles::Document.new(input, :url => article[:url])
    assert document.filtered_images.size > 3
    assert document.filtered_images.last[:data].size > 1000
    assert document.filtered_images.last[:width] > 100
    assert document.filtered_images.last[:height] > 100
  end

  private

  # Reads the fixture file "<:file>.<extension>" for the given index entry.
  def read_fixture(article, extension)
    File.read(File.join(ARTICLES_DIR, "#{article[:file]}.#{extension}"))
  end

  # True when an :expected :image value names a real image. The fixture
  # index writes a missing image as the bare scalar `nil`, which YAML loads
  # as the string "nil" (YAML's null spellings are ~ and null), so that
  # string is treated as absent alongside real nil and the empty string.
  def image_expected?(value)
    text = value.to_s
    !(text.empty? || text == 'nil')
  end

  # Looks up the index entry for WSJ_ARTICLE_URL.
  def wsj_article
    TEST_ARTICLES.detect{|candidate| candidate[:url] == WSJ_ARTICLE_URL}
  end

end
|
metadata
ADDED
@@ -0,0 +1,279 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: charles
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 29
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 0.0.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Jason Ling Xiaowei
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2012-05-30 00:00:00 Z
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: ferret
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
hash: 3
|
29
|
+
segments:
|
30
|
+
- 0
|
31
|
+
version: "0"
|
32
|
+
type: :runtime
|
33
|
+
version_requirements: *id001
|
34
|
+
- !ruby/object:Gem::Dependency
|
35
|
+
name: nokogiri
|
36
|
+
prerelease: false
|
37
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
38
|
+
none: false
|
39
|
+
requirements:
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
hash: 3
|
43
|
+
segments:
|
44
|
+
- 0
|
45
|
+
version: "0"
|
46
|
+
type: :runtime
|
47
|
+
version_requirements: *id002
|
48
|
+
- !ruby/object:Gem::Dependency
|
49
|
+
name: htmlentities
|
50
|
+
prerelease: false
|
51
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
52
|
+
none: false
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
hash: 3
|
57
|
+
segments:
|
58
|
+
- 0
|
59
|
+
version: "0"
|
60
|
+
type: :runtime
|
61
|
+
version_requirements: *id003
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: mechanize
|
64
|
+
prerelease: false
|
65
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
66
|
+
none: false
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
hash: 3
|
71
|
+
segments:
|
72
|
+
- 0
|
73
|
+
version: "0"
|
74
|
+
type: :runtime
|
75
|
+
version_requirements: *id004
|
76
|
+
- !ruby/object:Gem::Dependency
|
77
|
+
name: activesupport
|
78
|
+
prerelease: false
|
79
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
80
|
+
none: false
|
81
|
+
requirements:
|
82
|
+
- - ">="
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
hash: 3
|
85
|
+
segments:
|
86
|
+
- 0
|
87
|
+
version: "0"
|
88
|
+
type: :runtime
|
89
|
+
version_requirements: *id005
|
90
|
+
- !ruby/object:Gem::Dependency
|
91
|
+
name: rack
|
92
|
+
prerelease: false
|
93
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
94
|
+
none: false
|
95
|
+
requirements:
|
96
|
+
- - ">="
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
hash: 3
|
99
|
+
segments:
|
100
|
+
- 0
|
101
|
+
version: "0"
|
102
|
+
type: :runtime
|
103
|
+
version_requirements: *id006
|
104
|
+
- !ruby/object:Gem::Dependency
|
105
|
+
name: imagesize
|
106
|
+
prerelease: false
|
107
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
108
|
+
none: false
|
109
|
+
requirements:
|
110
|
+
- - ">="
|
111
|
+
- !ruby/object:Gem::Version
|
112
|
+
hash: 3
|
113
|
+
segments:
|
114
|
+
- 0
|
115
|
+
version: "0"
|
116
|
+
type: :runtime
|
117
|
+
version_requirements: *id007
|
118
|
+
description: Charles the Content Extractor
|
119
|
+
email:
|
120
|
+
- jason@jeyel.com
|
121
|
+
executables:
|
122
|
+
- charles
|
123
|
+
extensions: []
|
124
|
+
|
125
|
+
extra_rdoc_files: []
|
126
|
+
|
127
|
+
files:
|
128
|
+
- .gitignore
|
129
|
+
- Gemfile
|
130
|
+
- LICENSE
|
131
|
+
- README.md
|
132
|
+
- Rakefile
|
133
|
+
- bin/charles
|
134
|
+
- charles.gemspec
|
135
|
+
- lib/charles.rb
|
136
|
+
- lib/charles/document.rb
|
137
|
+
- lib/charles/images.rb
|
138
|
+
- lib/charles/internal_attributes.rb
|
139
|
+
- lib/charles/misc.rb
|
140
|
+
- lib/charles/version.rb
|
141
|
+
- optimise.rb
|
142
|
+
- test/articles.yml
|
143
|
+
- test/articles/20120525_1525_straitstimes.com.content.txt
|
144
|
+
- test/articles/20120525_1525_straitstimes.com.html
|
145
|
+
- test/articles/20120525_1534_bbc.co.uk.content.txt
|
146
|
+
- test/articles/20120525_1534_bbc.co.uk.html
|
147
|
+
- test/articles/20120525_1727_bbc.co.uk.content.txt
|
148
|
+
- test/articles/20120525_1727_bbc.co.uk.html
|
149
|
+
- test/articles/20120525_1730_channelnewsasia.com.content.txt
|
150
|
+
- test/articles/20120525_1730_channelnewsasia.com.html
|
151
|
+
- test/articles/20120525_1733_channelnewsasia.com.content.txt
|
152
|
+
- test/articles/20120525_1733_channelnewsasia.com.html
|
153
|
+
- test/articles/20120525_1736_nytimes.com.content.txt
|
154
|
+
- test/articles/20120525_1736_nytimes.com.html
|
155
|
+
- test/articles/20120525_1743_nytimes.com.content.txt
|
156
|
+
- test/articles/20120525_1743_nytimes.com.html
|
157
|
+
- test/articles/20120525_1747_techcrunch.com.content.txt
|
158
|
+
- test/articles/20120525_1747_techcrunch.com.html
|
159
|
+
- test/articles/20120528_0929_washingtonpost.com.content.txt
|
160
|
+
- test/articles/20120528_0929_washingtonpost.com.html
|
161
|
+
- test/articles/20120528_0931_latimes.com.content.txt
|
162
|
+
- test/articles/20120528_0931_latimes.com.html
|
163
|
+
- test/articles/20120528_0938_entertainment.time.com.content.txt
|
164
|
+
- test/articles/20120528_0938_entertainment.time.com.html
|
165
|
+
- test/articles/20120528_0943_bloomberg.com.content.txt
|
166
|
+
- test/articles/20120528_0943_bloomberg.com.html
|
167
|
+
- test/articles/20120528_0947_reuters.com.content.txt
|
168
|
+
- test/articles/20120528_0947_reuters.com.html
|
169
|
+
- test/articles/20120528_1106_reuters.com.content.txt
|
170
|
+
- test/articles/20120528_1106_reuters.com.html
|
171
|
+
- test/articles/20120528_1109_musicthing.blogspot.co.uk.content.txt
|
172
|
+
- test/articles/20120528_1109_musicthing.blogspot.co.uk.html
|
173
|
+
- test/articles/20120528_1114_mobileinc.co.uk.content.txt
|
174
|
+
- test/articles/20120528_1114_mobileinc.co.uk.html
|
175
|
+
- test/articles/20120528_1119_forbes.com.content.txt
|
176
|
+
- test/articles/20120528_1119_forbes.com.html
|
177
|
+
- test/articles/20120528_1122_techcrunch.com.content.txt
|
178
|
+
- test/articles/20120528_1122_techcrunch.com.html
|
179
|
+
- test/articles/20120528_1126_blogs.adobe.com.content.txt
|
180
|
+
- test/articles/20120528_1126_blogs.adobe.com.html
|
181
|
+
- test/articles/20120528_1142_thestar.com.my.content.txt
|
182
|
+
- test/articles/20120528_1142_thestar.com.my.html
|
183
|
+
- test/articles/20120528_1146_suntimes.com.content.txt
|
184
|
+
- test/articles/20120528_1146_suntimes.com.html
|
185
|
+
- test/articles/20120528_1148_asiaone.com.content.txt
|
186
|
+
- test/articles/20120528_1148_asiaone.com.html
|
187
|
+
- test/articles/20120529_1120_online.wsj.com.content.txt
|
188
|
+
- test/articles/20120529_1120_online.wsj.com.html
|
189
|
+
- test/articles/20120529_1122_online.wsj.com.content.txt
|
190
|
+
- test/articles/20120529_1122_online.wsj.com.html
|
191
|
+
- test/articles/20120529_1127_smh.com.au.content.txt
|
192
|
+
- test/articles/20120529_1127_smh.com.au.html
|
193
|
+
- test/test_charles.rb
|
194
|
+
homepage: https://github.com/jlxw/charles
|
195
|
+
licenses: []
|
196
|
+
|
197
|
+
post_install_message:
|
198
|
+
rdoc_options: []
|
199
|
+
|
200
|
+
require_paths:
|
201
|
+
- lib
|
202
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
203
|
+
none: false
|
204
|
+
requirements:
|
205
|
+
- - ">="
|
206
|
+
- !ruby/object:Gem::Version
|
207
|
+
hash: 3
|
208
|
+
segments:
|
209
|
+
- 0
|
210
|
+
version: "0"
|
211
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
212
|
+
none: false
|
213
|
+
requirements:
|
214
|
+
- - ">="
|
215
|
+
- !ruby/object:Gem::Version
|
216
|
+
hash: 3
|
217
|
+
segments:
|
218
|
+
- 0
|
219
|
+
version: "0"
|
220
|
+
requirements: []
|
221
|
+
|
222
|
+
rubyforge_project:
|
223
|
+
rubygems_version: 1.8.13
|
224
|
+
signing_key:
|
225
|
+
specification_version: 3
|
226
|
+
summary: Charles the Content Extractor
|
227
|
+
test_files:
|
228
|
+
- test/articles.yml
|
229
|
+
- test/articles/20120525_1525_straitstimes.com.content.txt
|
230
|
+
- test/articles/20120525_1525_straitstimes.com.html
|
231
|
+
- test/articles/20120525_1534_bbc.co.uk.content.txt
|
232
|
+
- test/articles/20120525_1534_bbc.co.uk.html
|
233
|
+
- test/articles/20120525_1727_bbc.co.uk.content.txt
|
234
|
+
- test/articles/20120525_1727_bbc.co.uk.html
|
235
|
+
- test/articles/20120525_1730_channelnewsasia.com.content.txt
|
236
|
+
- test/articles/20120525_1730_channelnewsasia.com.html
|
237
|
+
- test/articles/20120525_1733_channelnewsasia.com.content.txt
|
238
|
+
- test/articles/20120525_1733_channelnewsasia.com.html
|
239
|
+
- test/articles/20120525_1736_nytimes.com.content.txt
|
240
|
+
- test/articles/20120525_1736_nytimes.com.html
|
241
|
+
- test/articles/20120525_1743_nytimes.com.content.txt
|
242
|
+
- test/articles/20120525_1743_nytimes.com.html
|
243
|
+
- test/articles/20120525_1747_techcrunch.com.content.txt
|
244
|
+
- test/articles/20120525_1747_techcrunch.com.html
|
245
|
+
- test/articles/20120528_0929_washingtonpost.com.content.txt
|
246
|
+
- test/articles/20120528_0929_washingtonpost.com.html
|
247
|
+
- test/articles/20120528_0931_latimes.com.content.txt
|
248
|
+
- test/articles/20120528_0931_latimes.com.html
|
249
|
+
- test/articles/20120528_0938_entertainment.time.com.content.txt
|
250
|
+
- test/articles/20120528_0938_entertainment.time.com.html
|
251
|
+
- test/articles/20120528_0943_bloomberg.com.content.txt
|
252
|
+
- test/articles/20120528_0943_bloomberg.com.html
|
253
|
+
- test/articles/20120528_0947_reuters.com.content.txt
|
254
|
+
- test/articles/20120528_0947_reuters.com.html
|
255
|
+
- test/articles/20120528_1106_reuters.com.content.txt
|
256
|
+
- test/articles/20120528_1106_reuters.com.html
|
257
|
+
- test/articles/20120528_1109_musicthing.blogspot.co.uk.content.txt
|
258
|
+
- test/articles/20120528_1109_musicthing.blogspot.co.uk.html
|
259
|
+
- test/articles/20120528_1114_mobileinc.co.uk.content.txt
|
260
|
+
- test/articles/20120528_1114_mobileinc.co.uk.html
|
261
|
+
- test/articles/20120528_1119_forbes.com.content.txt
|
262
|
+
- test/articles/20120528_1119_forbes.com.html
|
263
|
+
- test/articles/20120528_1122_techcrunch.com.content.txt
|
264
|
+
- test/articles/20120528_1122_techcrunch.com.html
|
265
|
+
- test/articles/20120528_1126_blogs.adobe.com.content.txt
|
266
|
+
- test/articles/20120528_1126_blogs.adobe.com.html
|
267
|
+
- test/articles/20120528_1142_thestar.com.my.content.txt
|
268
|
+
- test/articles/20120528_1142_thestar.com.my.html
|
269
|
+
- test/articles/20120528_1146_suntimes.com.content.txt
|
270
|
+
- test/articles/20120528_1146_suntimes.com.html
|
271
|
+
- test/articles/20120528_1148_asiaone.com.content.txt
|
272
|
+
- test/articles/20120528_1148_asiaone.com.html
|
273
|
+
- test/articles/20120529_1120_online.wsj.com.content.txt
|
274
|
+
- test/articles/20120529_1120_online.wsj.com.html
|
275
|
+
- test/articles/20120529_1122_online.wsj.com.content.txt
|
276
|
+
- test/articles/20120529_1122_online.wsj.com.html
|
277
|
+
- test/articles/20120529_1127_smh.com.au.content.txt
|
278
|
+
- test/articles/20120529_1127_smh.com.au.html
|
279
|
+
- test/test_charles.rb
|