charles 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. data/.gitignore +17 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE +22 -0
  4. data/README.md +10 -0
  5. data/Rakefile +13 -0
  6. data/bin/charles +23 -0
  7. data/charles.gemspec +25 -0
  8. data/lib/charles/document.rb +177 -0
  9. data/lib/charles/images.rb +77 -0
  10. data/lib/charles/internal_attributes.rb +40 -0
  11. data/lib/charles/misc.rb +84 -0
  12. data/lib/charles/version.rb +3 -0
  13. data/lib/charles.rb +66 -0
  14. data/optimise.rb +72 -0
  15. data/test/articles/20120525_1525_straitstimes.com.content.txt +5 -0
  16. data/test/articles/20120525_1525_straitstimes.com.html +1929 -0
  17. data/test/articles/20120525_1534_bbc.co.uk.content.txt +19 -0
  18. data/test/articles/20120525_1534_bbc.co.uk.html +1777 -0
  19. data/test/articles/20120525_1727_bbc.co.uk.content.txt +39 -0
  20. data/test/articles/20120525_1727_bbc.co.uk.html +1889 -0
  21. data/test/articles/20120525_1730_channelnewsasia.com.content.txt +19 -0
  22. data/test/articles/20120525_1730_channelnewsasia.com.html +963 -0
  23. data/test/articles/20120525_1733_channelnewsasia.com.content.txt +19 -0
  24. data/test/articles/20120525_1733_channelnewsasia.com.html +923 -0
  25. data/test/articles/20120525_1736_nytimes.com.content.txt +21 -0
  26. data/test/articles/20120525_1736_nytimes.com.html +856 -0
  27. data/test/articles/20120525_1743_nytimes.com.content.txt +11 -0
  28. data/test/articles/20120525_1743_nytimes.com.html +98 -0
  29. data/test/articles/20120525_1747_techcrunch.com.content.txt +11 -0
  30. data/test/articles/20120525_1747_techcrunch.com.html +1098 -0
  31. data/test/articles/20120528_0929_washingtonpost.com.content.txt +23 -0
  32. data/test/articles/20120528_0929_washingtonpost.com.html +3335 -0
  33. data/test/articles/20120528_0931_latimes.com.content.txt +45 -0
  34. data/test/articles/20120528_0931_latimes.com.html +6371 -0
  35. data/test/articles/20120528_0938_entertainment.time.com.content.txt +31 -0
  36. data/test/articles/20120528_0938_entertainment.time.com.html +1261 -0
  37. data/test/articles/20120528_0943_bloomberg.com.content.txt +13 -0
  38. data/test/articles/20120528_0943_bloomberg.com.html +2874 -0
  39. data/test/articles/20120528_0947_reuters.com.content.txt +35 -0
  40. data/test/articles/20120528_0947_reuters.com.html +1563 -0
  41. data/test/articles/20120528_1106_reuters.com.content.txt +5 -0
  42. data/test/articles/20120528_1106_reuters.com.html +551 -0
  43. data/test/articles/20120528_1109_musicthing.blogspot.co.uk.content.txt +19 -0
  44. data/test/articles/20120528_1109_musicthing.blogspot.co.uk.html +865 -0
  45. data/test/articles/20120528_1114_mobileinc.co.uk.content.txt +15 -0
  46. data/test/articles/20120528_1114_mobileinc.co.uk.html +550 -0
  47. data/test/articles/20120528_1119_forbes.com.content.txt +15 -0
  48. data/test/articles/20120528_1119_forbes.com.html +1406 -0
  49. data/test/articles/20120528_1122_techcrunch.com.content.txt +58 -0
  50. data/test/articles/20120528_1122_techcrunch.com.html +1131 -0
  51. data/test/articles/20120528_1126_blogs.adobe.com.content.txt +13 -0
  52. data/test/articles/20120528_1126_blogs.adobe.com.html +303 -0
  53. data/test/articles/20120528_1142_thestar.com.my.content.txt +27 -0
  54. data/test/articles/20120528_1142_thestar.com.my.html +943 -0
  55. data/test/articles/20120528_1146_suntimes.com.content.txt +33 -0
  56. data/test/articles/20120528_1146_suntimes.com.html +5166 -0
  57. data/test/articles/20120528_1148_asiaone.com.content.txt +27 -0
  58. data/test/articles/20120528_1148_asiaone.com.html +1070 -0
  59. data/test/articles/20120529_1120_online.wsj.com.content.txt +56 -0
  60. data/test/articles/20120529_1120_online.wsj.com.html +3035 -0
  61. data/test/articles/20120529_1122_online.wsj.com.content.txt +35 -0
  62. data/test/articles/20120529_1122_online.wsj.com.html +2725 -0
  63. data/test/articles/20120529_1127_smh.com.au.content.txt +13 -0
  64. data/test/articles/20120529_1127_smh.com.au.html +2034 -0
  65. data/test/articles.yml +221 -0
  66. data/test/test_charles.rb +70 -0
  67. metadata +279 -0
data/test/articles.yml ADDED
@@ -0,0 +1,221 @@
1
+ -
2
+ :file: '20120525_1525_straitstimes.com'
3
+ :url: 'http://www.straitstimes.com/BreakingNews/Singapore/Story/STIStory_802924.html'
4
+ :expected:
5
+ :title: 'PAP: Chance to start afresh in Hougang'
6
+ :lede: 'PM Lee, Khaw Boon Wan urge residents to back young candidate'
7
+ :author: 'RACHEL CHANG'
8
+ :image: 'http://www.straitstimes.com/STI/STIMEDIA/image/20120524/ST_IMAGES_RALLYSHARPe.jpg'
9
+ -
10
+ :file: '20120525_1534_bbc.co.uk'
11
+ :url: 'http://www.bbc.co.uk/news/world-africa-18202545'
12
+ :expected:
13
+ :title: 'French President Hollande in Afghanistan on troop visit'
14
+ :lede: 'French President Francois Hollande has arrived in Afghanistan on a previously unannounced visit.'
15
+ :author: nil
16
+ :image: 'http://news.bbcimg.co.uk/media/images/60479000/jpg/_60479902_hollandeafp.jpg'
17
+ :images:
18
+ - 'http://news.bbcimg.co.uk/media/images/58929000/gif/_58929603_afghanistan_troops_464_jan2012.gif'
19
+ -
20
+ :file: '20120525_1727_bbc.co.uk'
21
+ :url: 'http://www.bbc.co.uk/news/health-18190352'
22
+ :expected:
23
+ :title: 'Male pill: gene discovery may lead to contraceptive'
24
+ :lede: 'It may be possible to develop a new male contraceptive pill after researchers in Edinburgh identified a gene critical for the production of healthy sperm.'
25
+ :author: nil
26
+ :image: 'http://news.bbcimg.co.uk/media/images/60456000/jpg/_60456207_c0126522-human_sperm_sem_x2.jpg'
27
+ -
28
+ :file: '20120525_1730_channelnewsasia.com'
29
+ :url: 'http://www.channelnewsasia.com/stories/singaporelocalnews/view/1203419/1/.html'
30
+ :expected:
31
+ :title: 'Cooling-off Day for Hougang By-Election'
32
+ :lede: 'SINGAPORE: It is Cooling-off Day for the Hougang By-Election on Friday.'
33
+ :author: nil
34
+ :image: 'http://www.channelnewsasia.com/components/display_image.php?id=494293'
35
+ -
36
+ :file: '20120525_1733_channelnewsasia.com'
37
+ :url: 'http://www.channelnewsasia.com/stories/singaporebusinessnews/view/1203496/1/.html'
38
+ :expected:
39
+ :title: "Singapore's manufacturing output contracts in April"
40
+ :lede: "SINGAPORE: Singapore's manufacturing output contracted unexpectedly for the second consecutive month in April, dragged down by a drop in production from the biomedical and electronics sectors."
41
+ :author: nil
42
+ :image: 'http://www.channelnewsasia.com/components/display_image.php?id=466095'
43
+ -
44
+ :file: '20120525_1736_nytimes.com'
45
+ :url: 'http://www.nytimes.com/2012/05/26/world/middleeast/egypt-presidential-election-runoff.html?_r=1&ref=global-home#h[]'
46
+ :expected:
47
+ :title: "Muslim Brotherhood Candidate to Face Former Prime Minister in Egyptian Runoff"
48
+ :lede: "CAIRO — The Islamist candidate of the Muslim Brotherhood will face former President Hosni Mubarak’s last prime minister in a runoff to become Egypt’s first freely elected president, several independent vote counts concluded Friday morning."
49
+ :author: 'DAVID D. KIRKPATRICK'
50
+ :image: 'http://graphics8.nytimes.com/images/2012/05/26/global-home/26egypt-image/26egypt-image-articleInline.jpg'
51
+ -
52
+ :file: '20120525_1743_nytimes.com'
53
+ :url: 'http://bits.blogs.nytimes.com/2012/05/24/mark-zuckerberg-officially-a-billionaire/?ref=technology'
54
+ :expected:
55
+ :title: "Mark Zuckerberg Officially a Billionaire"
56
+ :lede: "Mark Zuckerberg, the founder of Facebook, is now officially a billionaire. Until now, Mr. Zuckerberg has been staggeringly rich only on paper."
57
+ :author: 'NICK BILTON'
58
+ :image: 'http://graphics8.nytimes.com/images/2012/05/23/technology/bits-zuck-stock/bits-zuck-stock-tmagArticle.jpg'
59
+ -
60
+ :file: '20120525_1747_techcrunch.com'
61
+ :url: 'http://techcrunch.com/2012/05/24/manpacks-startup-perks-condoms/'
62
+ :expected:
63
+ :title: "Manpacks Launches Its Startup Perks Program With Free Condoms"
64
+ :lede: "Well, that’s one way to get some attention for your product launch."
65
+ :author: 'Anthony Ha'
66
+ :image: 'http://tctechcrunch2011.files.wordpress.com/2012/05/photo-51.jpg?w=288'
67
+ -
68
+ :file: '20120528_0929_washingtonpost.com'
69
+ :url: 'http://www.washingtonpost.com/world/middle_east/un-security-council-blames-syrian-forces-for-shelling-houla-condemns-attacks-on-civilians/2012/05/27/gJQAjoyLvU_story.html'
70
+ :expected:
71
+ :title: "UN Security Council blames Syrian forces for shelling Houla, condemns attacks on civilians"
72
+ :lede: "UNITED NATIONS — The U.N. Security Council on Sunday blamed the Syrian government for attacking residential areas of the town of Houla with artillery and tank shelling and also condemned the close-range killings of civilians there — but avoided saying who was responsible for the massacre of more than 100 men, women and children."
73
+ :author: 'Associated Press'
74
+ :image: nil
75
+ -
76
+ :file: '20120528_0931_latimes.com'
77
+ :url: 'http://www.latimes.com/sports/motorracing/la-sp-indy-500-20120528,0,357526.story'
78
+ :expected:
79
+ :title: "Dario Franchitti is last leader standing, wins his third Indy 500"
80
+ :lede: "Dario Franchitti rebuffs Takuma Sato's frantic last-lap bid to win Indy 500 with a record 34 lead changes. It's a sentimental victory as the race honors the late Dan Wheldon, Franchitti's close friend."
81
+ :author: 'Jim Peltz'
82
+ :image: 'http://www.trbimg.com/img-4fc2ce93/turbine/la-sp-indy-500-20120528-001/600'
83
+ -
84
+ :file: '20120528_0938_entertainment.time.com'
85
+ :url: 'http://entertainment.time.com/2012/05/27/the-palme-damour/?iid=ent-main-lede'
86
+ :expected:
87
+ :title: "The Palme d'Amour: Director Michael Haneke Takes Cannes' Top Prize—Again"
88
+ :lede: "Michael Haneke's quietly magnificent story of love on the brink of death takes the top prize, while seven American films oozing with star quality get no love at all"
89
+ :author: 'Richard Corliss'
90
+ :image: 'http://timeentertainment.files.wordpress.com/2012/05/amour.jpg?w=600&h=400&crop=1'
91
+ -
92
+ :file: '20120528_0943_bloomberg.com'
93
+ :url: 'http://www.bloomberg.com/news/2012-05-28/euro-u-s-equity-futures-advance-on-greece-optimism.html'
94
+ :expected:
95
+ :title: "Euro, U.S. Equity Futures Advance on Greece Optimism"
96
+ :lede: "The euro strengthened for the first time in five days, U.S. equity-index futures advanced and oil gained after Greek opinion polls showed voters backing parties that support the European Union’s bailout, easing concern the country will exit the currency bloc."
97
+ :author: 'Bloomberg News'
98
+ :image: nil
99
+ -
100
+ :file: '20120528_0947_reuters.com'
101
+ :url: 'http://www.reuters.com/article/2012/05/27/us-nuclear-iran-uranium-idUSBRE84O0SN20120527'
102
+ :expected:
103
+ :title: "Iran not ready for visit to suspect nuclear site"
104
+ :lede: "The U.N. nuclear watchdog has not yet given good enough reasons to visit an Iranian site where it suspects there may have been experiments for developing nuclear weapons, Iranian media said."
105
+ :author: 'Marcus George'
106
+ :image: 'http://s1.reutersmedia.net/resources/r/?m=02&d=20120527&t=2&i=612137389&w=&fh=&fw=&ll=700&pl=300&r=CBRE84O17PO00'
107
+ :images:
108
+ - 'http://s1.reutersmedia.net/resources/r/?m=02&d=20120527&t=2&i=612137390&w=&fh=&fw=&ll=700&pl=300&r=CBRE84O17PP00'
109
+ -
110
+ :file: '20120528_1106_reuters.com'
111
+ :url: 'https://www.eff.org/deeplinks/2012/05/megaupload-user-asks-court-files-back-again'
112
+ :expected:
113
+ :title: "Megaupload User Asks Court for Files Back. Again."
114
+ :lede: "You may remember that EFF’s client, Kyle Goodwin, asked the court to return the legal files he lost when Megaupload was seized last January. Since then, we’ve been to court, both for a hearing and a mediation, and nothing has changed. The key problem: the government has failed to help third parties like Kyle get access to their data. So we have no choice but to go back to court."
115
+ :author: 'Julie Samuels'
116
+ :image: nil
117
+ -
118
+ :file: '20120528_1109_musicthing.blogspot.co.uk'
119
+ :url: 'http://musicthing.blogspot.co.uk/2005/05/tiny-music-makers-pt-3-thx-sound.html'
120
+ :expected:
121
+ :title: "TINY MUSIC MAKERS: Pt 3: The THX Sound"
122
+ :lede: '"I like to say that the THX sound is the most widely-recognized piece of computer-generated music in the world," says Andy Moorer. "This may or may not be true, but it sounds cool!"'
123
+ :author: 'Tom Whitwell'
124
+ :image: 'http://photos1.blogger.com/blogger/4749/510/1600/moorer.jpg'
125
+ -
126
+ :file: '20120528_1114_mobileinc.co.uk'
127
+ :url: 'http://www.mobileinc.co.uk/2012/03/ive-started-a-newsletter-the-mobile-inc-digest/'
128
+ :expected:
129
+ :title: "I’ve Started A Newsletter – The Mobile Inc Digest"
130
+ :lede: "I’m spending so much time designing, working and recovering (shoulder surgery ouch) that there’s less and less time to blog right now. However I’m reading and researching more than ever."
131
+ :author: 'Murat'
132
+ :image: nil
133
+ -
134
+ :file: '20120528_1119_forbes.com'
135
+ :url: 'http://www.forbes.com/sites/haydnshaughnessy/2012/05/26/why-crisis-in-spain-this-week-should-have-us-more-worried-than-greece/'
136
+ :expected:
137
+ :title: "Why Crisis in Spain This Week Became More Important Than Greece"
138
+ :lede: "Local debt is the big untold story of the Euro crisis and, if that was not apparent before, it became glaringly so when Catalonia’s President this week told the world his autonomous Catalan Government would struggle to meet its bills at the end of this month."
139
+ :author: 'Haydn Shaughnessy'
140
+ :image: nil
141
+ -
142
+ :file: '20120528_1122_techcrunch.com'
143
+ :url: 'http://techcrunch.com/2012/05/27/hey-kids-get-off-my-lawn-the-once-and-future-visual-programming-environment/'
144
+ :expected:
145
+ :title: "Hey Kids, Get Off My Lawn: The Once And Future Visual Programming Environment"
146
+ :lede: ""
147
+ :author: 'Kwindla Hultman Kramer'
148
+ :image: 'http://tctechcrunch2011.files.wordpress.com/2012/05/seededfertilizedlawn.jpg?w=288'
149
+ -
150
+ :file: '20120528_1126_blogs.adobe.com'
151
+ :url: 'http://blogs.adobe.com/digitalmarketing/personalization/conversion-optimization/understand-the-math-behind-it-all-bayesian-statistics/'
152
+ :expected:
153
+ :title: "Understand the Math Behind it All: Bayesian Statistics"
154
+ :lede: "Most mar�ket�ing peo�ple have only a pass�ing inter�ac�tion with sta�tis�tics, and often times only under�stand it as a mea�sure of how it has impacted their daily life. One of the funny things peo�ple don�t real�ize is that there are two com�pletely dif�fer�ent com�pet�ing schools of thought when it comes to sta�tis�tics. Most peo�ple are famil�iar with fre�quen�tist sta�tis�tics, hav�ing dealt with things like nor�mal dis�tri�b�u�tion, bell curves, and estab�lished prob�a�bil�i�ties. The other school, Bayesian sta�tis�tics, is a realm that fewer peo�ple are famil�iar with, but just as applic�a�ble. In fact, the move over the last few years is for more peo�ple to change from the fre�quen�tist model to Bayesian techniques."
155
+ :author: 'Andrew Anderson'
156
+ :image: nil
157
+ -
158
+ :file: '20120528_1142_thestar.com.my'
159
+ :url: 'http://thestar.com.my/news/story.asp?file=/2012/5/27/nation/11368993&sec=nation'
160
+ :expected:
161
+ :title: "Videogame music writers on tour here at Istana Budaya"
162
+ :lede: "PETALING JAYA: Computer game soundtracks are as recognisable as movie scores these days and have garnered a huge following of fans."
163
+ :author: 'TAN KIT HOONG and CHONG JINN XIUNG'
164
+ :image: 'http://thestar.com.my/archives/2012/5/27/nation/n_10Tallarico.jpg'
165
+ -
166
+ :file: '20120528_1146_suntimes.com'
167
+ :url: 'http://www.suntimes.com/sports/autoracing/12812647-419/dario-franchitti-wins-his-third-indy-500-in-honor-of-late-friend-dan-wheldon.html'
168
+ :expected:
169
+ :title: "Dario Franchitti wins his third Indy 500 in honor of late friend Dan Wheldon"
170
+ :lede: "INDIANAPOLIS — You can debate whether the Indianapolis 500 is the world’s greatest race."
171
+ :author: 'HERB GOULD'
172
+ :image: 'http://www.suntimes.com/csp/cms/sites/dt.common.streams.StreamServer.cls?STREAMOID=G9CD0HBsfM6Ez9hJq$J_vs$daE2N3K4ZzOUsqbU5sYtfyOFkwIFfaZvhenwZq5N7WCsjLu883Ygn4B49Lvm9bPe2QeMKQdVeZmXF$9l$4uCZ8QDXhaHEp3rvzXRJFdy0KqPHLoMevcTLo3h8xh70Y6N_U_CryOsw6FTOdKL_jpQ-&CONTENTTYPE=image/jpeg'
173
+ -
174
+ :file: '20120528_1148_asiaone.com'
175
+ :url: 'http://news.asiaone.com/News/Latest%2BNews/Singapore/Story/A1Story20120528-348743.html'
176
+ :expected:
177
+ :title: "Choo: Back on the ground today"
178
+ :lede: "He may not have been successful in the Hougang by-election but, for People's Action Party's (PAP's) Desmond Choo, it is back to work today, helping residents on the ground."
179
+ :author: 'Gwendolyn Ng'
180
+ :image: 'http://news.asiaone.com/A1MEDIA/news/05May12/images/20120528.083508_dchood.jpg'
181
+ -
182
+ :file: '20120529_1120_online.wsj.com'
183
+ :url: 'http://online.wsj.com/article/SB10001424052702303674004577433160886451978.html'
184
+ :expected:
185
+ :title: "Storied Law Firm Dewey Files Chapter 11"
186
+ :lede: "The embattled New York law firm Dewey & LeBoeuf LLP has filed for bankruptcy protection, a move that effectively ends what had been at its height a 1,300-lawyer global enterprise and marks one of the largest law-firm failures in U.S. history."
187
+ :author: 'JENNIFER SMITH and ASHBY JONES'
188
+ :image: 'http://si.wsj.net/public/resources/images/OB-SV369_dewey0_D_20120503161002.jpg'
189
+ -
190
+ :file: '20120529_1122_online.wsj.com'
191
+ :url: 'http://online.wsj.com/article/SB10001424052702303674004577432412894441148.html?mod=WSJ_Tech_Europe_INTL_LSMODULE'
192
+ :expected:
193
+ :title: "Advanced Malware Targets Middle East"
194
+ :lede: 'Computer malware described as "the most sophisticated cyberweapon yet unleashed" has been uncovered in computers in the Middle East and may have infected machines in Europe, according to reports from antivirus researchers and software makers in Russia, Hungary and Ireland.'
195
+ :author: nil
196
+ :image: nil
197
+ -
198
+ :file: '20120529_1127_smh.com.au'
199
+ :url: 'http://www.smh.com.au/digital-life/digital-life-news/mark-zuckerberg-makes-surprise-cameo-on-chinese-tv-20120529-1zfw2.html'
200
+ :expected:
201
+ :title: "Mark Zuckerberg makes surprise cameo on Chinese TV"
202
+ :lede: "BEIJING — Social media sites and blogs have lit up after eagle-eyed viewers spotted a surprise cameo in a Chinese TV documentary about the country's police force: Facebook founder Mark Zuckerberg and his now-wife, Priscilla Chan."
203
+ :author: nil
204
+ :image: 'http://images.smh.com.au/2012/05/29/3333176/art_zuckerbergs-420x0.jpg'
205
+ -
206
+ :file: ''
207
+ :url: ''
208
+ :expected:
209
+ :title: ""
210
+ :lede: ""
211
+ :author: ''
212
+ :image: ''
213
+ -
214
+ :file: ''
215
+ :url: ''
216
+ :expected:
217
+ :title: ""
218
+ :lede: ""
219
+ :author: ''
220
+ :image: ''
221
+
@@ -0,0 +1,70 @@
1
+ require 'test/unit'
2
+ require 'charles'
3
+ require 'yaml'
4
+ #require 'active_support/testing/assertions'
5
+
6
+ TEST_ARTICLES = YAML.load_file("test/articles.yml")
7
+
8
+ Charles.options[:tmp_path] = File.dirname(__FILE__) + "/tmp"
9
+
10
+ class CharlesTest < Test::Unit::TestCase
11
+ #include ActiveSupport::Testing::Assertions
12
+
13
+ def setup
14
+ end
15
+
16
+ def test_articles
17
+ _scores = {:content => [], :title => [], :image => []}
18
+ TEST_ARTICLES.each{|article|
19
+ next if article[:file].empty?
20
+ input = File.read("test/articles/#{article[:file]}.html")
21
+ document = Charles::Document.new(input, :url => article[:url])
22
+ result = document.content
23
+ expected = File.read("test/articles/#{article[:file]}.content.txt")
24
+ content_score = Charles::Misc.compare_strings(result, expected)
25
+ #pp [content_score, result, expected, article[:url]] if content_score < 0.1
26
+ _scores[:content] << content_score
27
+ title_score = Charles::Misc.compare_strings(document.title, article[:expected][:title])
28
+ #pp [title_score, document.title, article[:expected][:title], article[:url]] if title_score < 0.1
29
+ _scores[:title] << title_score
30
+
31
+ if article[:expected][:image]
32
+ _scores[:image] << (document.images.index(article[:expected][:image]) ? 1 : 0)
33
+ end
34
+ }
35
+
36
+ assert _scores[:content].select{|score| score < 0.5}.mean > 0.2
37
+ assert _scores[:content].select{|score| score < 0.1}.size < 4
38
+ assert _scores[:content].select{|score| score < 0.01}.size < 1
39
+ assert _scores[:title].select{|score| score < 0.5}.mean > 0.15
40
+ assert _scores[:title].select{|score| score < 0.1}.size < 5
41
+ assert _scores[:title].select{|score| score < 0.01}.size < 2
42
+ assert _scores[:image].mean > 0.4
43
+ end
44
+
45
+ def test_clean_title
46
+ article = TEST_ARTICLES.detect{|article| article[:url] == 'http://online.wsj.com/article/SB10001424052702303674004577433160886451978.html'}
47
+ input = File.read("test/articles/#{article[:file]}.html")
48
+ sample_titles = ['Former ML closer Armando Benitez signs with Ducks - WSJ.com',
49
+ 'The Top 10 Clean-Tech Companies - WSJ.com',
50
+ 'Book Review: Internal Time - WSJ.com',
51
+ 'NASA Working With Private Sector � Letters to the Editor - WSJ.com',
52
+ 'NASA Working With Private Sector � Letters to the Editor - WSJ.com',
53
+ 'San Francisco Symphony Orchestra | Radicals Ready for the Road - WSJ.com']
54
+ document = Charles::Document.new(input, :url => article[:url], :sample_titles => sample_titles)
55
+ assert document.title.include?('WSJ.com')
56
+ assert !document.clean_title.include?('WSJ.com')
57
+ end
58
+
59
+ def test_filtered_images
60
+ article = TEST_ARTICLES.detect{|article| article[:url] == 'http://online.wsj.com/article/SB10001424052702303674004577433160886451978.html'}
61
+ input = File.read("test/articles/#{article[:file]}.html")
62
+ document = Charles::Document.new(input, :url => article[:url])
63
+ assert document.filtered_images.size > 3
64
+ assert document.filtered_images.last[:data].size > 1000
65
+ assert document.filtered_images.last[:width] > 100
66
+ assert document.filtered_images.last[:height] > 100
67
+ end
68
+
69
+
70
+ end
metadata ADDED
@@ -0,0 +1,279 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: charles
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Jason Ling Xiaowei
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2012-05-30 00:00:00 Z
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: ferret
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ hash: 3
29
+ segments:
30
+ - 0
31
+ version: "0"
32
+ type: :runtime
33
+ version_requirements: *id001
34
+ - !ruby/object:Gem::Dependency
35
+ name: nokogiri
36
+ prerelease: false
37
+ requirement: &id002 !ruby/object:Gem::Requirement
38
+ none: false
39
+ requirements:
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ hash: 3
43
+ segments:
44
+ - 0
45
+ version: "0"
46
+ type: :runtime
47
+ version_requirements: *id002
48
+ - !ruby/object:Gem::Dependency
49
+ name: htmlentities
50
+ prerelease: false
51
+ requirement: &id003 !ruby/object:Gem::Requirement
52
+ none: false
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ hash: 3
57
+ segments:
58
+ - 0
59
+ version: "0"
60
+ type: :runtime
61
+ version_requirements: *id003
62
+ - !ruby/object:Gem::Dependency
63
+ name: mechanize
64
+ prerelease: false
65
+ requirement: &id004 !ruby/object:Gem::Requirement
66
+ none: false
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ hash: 3
71
+ segments:
72
+ - 0
73
+ version: "0"
74
+ type: :runtime
75
+ version_requirements: *id004
76
+ - !ruby/object:Gem::Dependency
77
+ name: activesupport
78
+ prerelease: false
79
+ requirement: &id005 !ruby/object:Gem::Requirement
80
+ none: false
81
+ requirements:
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ hash: 3
85
+ segments:
86
+ - 0
87
+ version: "0"
88
+ type: :runtime
89
+ version_requirements: *id005
90
+ - !ruby/object:Gem::Dependency
91
+ name: rack
92
+ prerelease: false
93
+ requirement: &id006 !ruby/object:Gem::Requirement
94
+ none: false
95
+ requirements:
96
+ - - ">="
97
+ - !ruby/object:Gem::Version
98
+ hash: 3
99
+ segments:
100
+ - 0
101
+ version: "0"
102
+ type: :runtime
103
+ version_requirements: *id006
104
+ - !ruby/object:Gem::Dependency
105
+ name: imagesize
106
+ prerelease: false
107
+ requirement: &id007 !ruby/object:Gem::Requirement
108
+ none: false
109
+ requirements:
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ hash: 3
113
+ segments:
114
+ - 0
115
+ version: "0"
116
+ type: :runtime
117
+ version_requirements: *id007
118
+ description: Charles the Content Extractor
119
+ email:
120
+ - jason@jeyel.com
121
+ executables:
122
+ - charles
123
+ extensions: []
124
+
125
+ extra_rdoc_files: []
126
+
127
+ files:
128
+ - .gitignore
129
+ - Gemfile
130
+ - LICENSE
131
+ - README.md
132
+ - Rakefile
133
+ - bin/charles
134
+ - charles.gemspec
135
+ - lib/charles.rb
136
+ - lib/charles/document.rb
137
+ - lib/charles/images.rb
138
+ - lib/charles/internal_attributes.rb
139
+ - lib/charles/misc.rb
140
+ - lib/charles/version.rb
141
+ - optimise.rb
142
+ - test/articles.yml
143
+ - test/articles/20120525_1525_straitstimes.com.content.txt
144
+ - test/articles/20120525_1525_straitstimes.com.html
145
+ - test/articles/20120525_1534_bbc.co.uk.content.txt
146
+ - test/articles/20120525_1534_bbc.co.uk.html
147
+ - test/articles/20120525_1727_bbc.co.uk.content.txt
148
+ - test/articles/20120525_1727_bbc.co.uk.html
149
+ - test/articles/20120525_1730_channelnewsasia.com.content.txt
150
+ - test/articles/20120525_1730_channelnewsasia.com.html
151
+ - test/articles/20120525_1733_channelnewsasia.com.content.txt
152
+ - test/articles/20120525_1733_channelnewsasia.com.html
153
+ - test/articles/20120525_1736_nytimes.com.content.txt
154
+ - test/articles/20120525_1736_nytimes.com.html
155
+ - test/articles/20120525_1743_nytimes.com.content.txt
156
+ - test/articles/20120525_1743_nytimes.com.html
157
+ - test/articles/20120525_1747_techcrunch.com.content.txt
158
+ - test/articles/20120525_1747_techcrunch.com.html
159
+ - test/articles/20120528_0929_washingtonpost.com.content.txt
160
+ - test/articles/20120528_0929_washingtonpost.com.html
161
+ - test/articles/20120528_0931_latimes.com.content.txt
162
+ - test/articles/20120528_0931_latimes.com.html
163
+ - test/articles/20120528_0938_entertainment.time.com.content.txt
164
+ - test/articles/20120528_0938_entertainment.time.com.html
165
+ - test/articles/20120528_0943_bloomberg.com.content.txt
166
+ - test/articles/20120528_0943_bloomberg.com.html
167
+ - test/articles/20120528_0947_reuters.com.content.txt
168
+ - test/articles/20120528_0947_reuters.com.html
169
+ - test/articles/20120528_1106_reuters.com.content.txt
170
+ - test/articles/20120528_1106_reuters.com.html
171
+ - test/articles/20120528_1109_musicthing.blogspot.co.uk.content.txt
172
+ - test/articles/20120528_1109_musicthing.blogspot.co.uk.html
173
+ - test/articles/20120528_1114_mobileinc.co.uk.content.txt
174
+ - test/articles/20120528_1114_mobileinc.co.uk.html
175
+ - test/articles/20120528_1119_forbes.com.content.txt
176
+ - test/articles/20120528_1119_forbes.com.html
177
+ - test/articles/20120528_1122_techcrunch.com.content.txt
178
+ - test/articles/20120528_1122_techcrunch.com.html
179
+ - test/articles/20120528_1126_blogs.adobe.com.content.txt
180
+ - test/articles/20120528_1126_blogs.adobe.com.html
181
+ - test/articles/20120528_1142_thestar.com.my.content.txt
182
+ - test/articles/20120528_1142_thestar.com.my.html
183
+ - test/articles/20120528_1146_suntimes.com.content.txt
184
+ - test/articles/20120528_1146_suntimes.com.html
185
+ - test/articles/20120528_1148_asiaone.com.content.txt
186
+ - test/articles/20120528_1148_asiaone.com.html
187
+ - test/articles/20120529_1120_online.wsj.com.content.txt
188
+ - test/articles/20120529_1120_online.wsj.com.html
189
+ - test/articles/20120529_1122_online.wsj.com.content.txt
190
+ - test/articles/20120529_1122_online.wsj.com.html
191
+ - test/articles/20120529_1127_smh.com.au.content.txt
192
+ - test/articles/20120529_1127_smh.com.au.html
193
+ - test/test_charles.rb
194
+ homepage: https://github.com/jlxw/charles
195
+ licenses: []
196
+
197
+ post_install_message:
198
+ rdoc_options: []
199
+
200
+ require_paths:
201
+ - lib
202
+ required_ruby_version: !ruby/object:Gem::Requirement
203
+ none: false
204
+ requirements:
205
+ - - ">="
206
+ - !ruby/object:Gem::Version
207
+ hash: 3
208
+ segments:
209
+ - 0
210
+ version: "0"
211
+ required_rubygems_version: !ruby/object:Gem::Requirement
212
+ none: false
213
+ requirements:
214
+ - - ">="
215
+ - !ruby/object:Gem::Version
216
+ hash: 3
217
+ segments:
218
+ - 0
219
+ version: "0"
220
+ requirements: []
221
+
222
+ rubyforge_project:
223
+ rubygems_version: 1.8.13
224
+ signing_key:
225
+ specification_version: 3
226
+ summary: Charles the Content Extractor
227
+ test_files:
228
+ - test/articles.yml
229
+ - test/articles/20120525_1525_straitstimes.com.content.txt
230
+ - test/articles/20120525_1525_straitstimes.com.html
231
+ - test/articles/20120525_1534_bbc.co.uk.content.txt
232
+ - test/articles/20120525_1534_bbc.co.uk.html
233
+ - test/articles/20120525_1727_bbc.co.uk.content.txt
234
+ - test/articles/20120525_1727_bbc.co.uk.html
235
+ - test/articles/20120525_1730_channelnewsasia.com.content.txt
236
+ - test/articles/20120525_1730_channelnewsasia.com.html
237
+ - test/articles/20120525_1733_channelnewsasia.com.content.txt
238
+ - test/articles/20120525_1733_channelnewsasia.com.html
239
+ - test/articles/20120525_1736_nytimes.com.content.txt
240
+ - test/articles/20120525_1736_nytimes.com.html
241
+ - test/articles/20120525_1743_nytimes.com.content.txt
242
+ - test/articles/20120525_1743_nytimes.com.html
243
+ - test/articles/20120525_1747_techcrunch.com.content.txt
244
+ - test/articles/20120525_1747_techcrunch.com.html
245
+ - test/articles/20120528_0929_washingtonpost.com.content.txt
246
+ - test/articles/20120528_0929_washingtonpost.com.html
247
+ - test/articles/20120528_0931_latimes.com.content.txt
248
+ - test/articles/20120528_0931_latimes.com.html
249
+ - test/articles/20120528_0938_entertainment.time.com.content.txt
250
+ - test/articles/20120528_0938_entertainment.time.com.html
251
+ - test/articles/20120528_0943_bloomberg.com.content.txt
252
+ - test/articles/20120528_0943_bloomberg.com.html
253
+ - test/articles/20120528_0947_reuters.com.content.txt
254
+ - test/articles/20120528_0947_reuters.com.html
255
+ - test/articles/20120528_1106_reuters.com.content.txt
256
+ - test/articles/20120528_1106_reuters.com.html
257
+ - test/articles/20120528_1109_musicthing.blogspot.co.uk.content.txt
258
+ - test/articles/20120528_1109_musicthing.blogspot.co.uk.html
259
+ - test/articles/20120528_1114_mobileinc.co.uk.content.txt
260
+ - test/articles/20120528_1114_mobileinc.co.uk.html
261
+ - test/articles/20120528_1119_forbes.com.content.txt
262
+ - test/articles/20120528_1119_forbes.com.html
263
+ - test/articles/20120528_1122_techcrunch.com.content.txt
264
+ - test/articles/20120528_1122_techcrunch.com.html
265
+ - test/articles/20120528_1126_blogs.adobe.com.content.txt
266
+ - test/articles/20120528_1126_blogs.adobe.com.html
267
+ - test/articles/20120528_1142_thestar.com.my.content.txt
268
+ - test/articles/20120528_1142_thestar.com.my.html
269
+ - test/articles/20120528_1146_suntimes.com.content.txt
270
+ - test/articles/20120528_1146_suntimes.com.html
271
+ - test/articles/20120528_1148_asiaone.com.content.txt
272
+ - test/articles/20120528_1148_asiaone.com.html
273
+ - test/articles/20120529_1120_online.wsj.com.content.txt
274
+ - test/articles/20120529_1120_online.wsj.com.html
275
+ - test/articles/20120529_1122_online.wsj.com.content.txt
276
+ - test/articles/20120529_1122_online.wsj.com.html
277
+ - test/articles/20120529_1127_smh.com.au.content.txt
278
+ - test/articles/20120529_1127_smh.com.au.html
279
+ - test/test_charles.rb