cyx-scraper 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Cyril David
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.markdown ADDED
@@ -0,0 +1,46 @@
1
+ Scraper Library
2
+ ===============
3
+
4
+ Objectives
5
+ ----------
6
+ To provide a generic Ruby gem that makes it easy to scrape various sites. The following lists the types of webpages targeted by this library:
7
+
8
+ 1. Youtube.com
9
+ 2. Wikipedia.org
10
+ 3. Vimeo.com
11
+ 4. Flickr.com
12
+ 5. Any blog, article, news, etc.
13
+
14
+ Extracting information from Youtube or vimeo
15
+ --------------------------------------------
16
+
17
+ For youtube and vimeo, the following sample code best describes what you can expect:
18
+
19
+ @scraper = Scraper( :url => "http://www.youtube.com/watch?v=MDhMBxAHGYE" )
20
+ # => #<Scraper::Youtube>
21
+
22
+ @scraper.thumbnail
23
+ # => "http://i.ytimg.com/vi/MDhMBxAHGYE/2.jpg"
24
+
25
+ @scraper.title
26
+ # => "Rick Roll [Geek Edition]"
27
+
28
+ @scraper.html
29
+ # => "<object width="425" height="344"><param name="movie" value="http://www.youtube.com/v/MDhMBxAHGYE&hl=en&fs=1&"></param><param name="allowFullScreen" value="true"></param><param name="allowscriptaccess" value="always"></param><embed src="http://www.youtube.com/v/MDhMBxAHGYE&hl=en&fs=1&" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="425" height="344"></embed></object>"
30
+
31
+ Extracting content from blogs, news articles, and beyond
32
+ --------------------------------------------------------
33
+
34
+ When a URL points to a webpage that isn't part of the special group (movies, photos, and other multimedia), the content portion of the page is extracted from that URL using a relevancy scoring algorithm.
35
+
36
+ Example:
37
+
38
+ @scraper = Scraper( :url => "http://www.alistapart.com/articles/unwebbable")
39
+ # => #<Scraper::Article>
40
+
41
+ @scraper.title
42
+ # => "A List Apart: Articles: Unwebbable"
43
+
44
+ @scraper.text
45
+ # => "It's time we came to grips with the fact that not every "document" can be a web page." ...
46
+
data/Rakefile ADDED
@@ -0,0 +1,56 @@
1
require 'rubygems'
require 'rake'

# Gem packaging tasks. Jeweler is optional: when it is missing only a hint
# is printed and the remaining tasks stay usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "scraper"
    gem.summary = %Q{Scrapes video embeds and article content from web pages}
    gem.email = "cyx.ucron@gmail.com"
    gem.homepage = "http://github.com/cyx/scraper"
    gem.authors = ["Cyril David"]
    # lib/scraper/article.rb requires nokogiri and lib/scraper/youtube.rb
    # requires builder, so both must be declared for the gem to be usable
    # after installation.
    gem.add_dependency "nokogiri"
    gem.add_dependency "builder"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end

rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# `rake test` runs every test/**/*_test.rb with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

# `rake rcov` runs the same tests under RCov when it is installed; otherwise
# the task aborts with installation instructions.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end


task :default => :test

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # The package ships a plain-text VERSION file; VERSION.yml is still
  # honoured as a fallback for checkouts that use the older layout.
  version =
    if File.exist?('VERSION')
      File.read('VERSION').strip
    elsif File.exist?('VERSION.yml')
      require 'yaml'
      config = YAML.load(File.read('VERSION.yml'))
      "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
    else
      ""
    end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "scraper #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
56
+
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.0
@@ -0,0 +1,168 @@
1
+ # Copyright (c) 2009 [Cyril David]
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining
4
+ # a copy of this software and associated documentation files (the
5
+ # "Software"), to deal in the Software without restriction, including
6
+ # without limitation the rights to use, copy, modify, merge, publish,
7
+ # distribute, sublicense, and/or sell copies of the Software, and to
8
+ # permit persons to whom the Software is furnished to do so, subject to
9
+ # the following conditions:
10
+ #
11
+ # The above copyright notice and this permission notice shall be
12
+ # included in all copies or substantial portions of the Software.
13
+ #
14
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21
+
22
+ require 'nokogiri'
23
+ require 'open-uri'
24
+
25
module Scraper
  # Extracts the title and the main content container of an arbitrary HTML
  # page (blog post, news article, ...).
  #
  # Every <p> element "votes" for its parent node. The parent's score is
  # built from its class/id names and the amount of comma-separated text it
  # holds; the highest-scoring parent is kept and stripped of noise.
  class Article
    # Raised by #initialize when no content container could be selected.
    class Unsupported < StandardError; end

    BAD_CLASS_NAMES  = /(comment|meta|footer|footnote)/
    # NOTE: whitespace must be written as \s inside a regexp literal; the
    # doubled backslash previously used here matched a literal "\s", so
    # multi-valued class attributes such as "hentry post" never matched.
    GOOD_CLASS_NAMES = /((^|\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\s|$))/

    BAD_ID_NAMES  = /(comment|meta|footer|footnote)/
    GOOD_ID_NAMES = /^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/

    # Content of the page's first <title> element, or nil when there is none.
    attr_reader :title

    # Handler-detection hook: Article accepts any arguments, which makes it
    # the catch-all handler.
    def self.=~( args )
      true
    end

    # Usage:
    # ======
    #
    #   @article = Scraper::Article.new(:url => "http://tinyurl.com/ys9wt")
    #   # or, with markup that was fetched elsewhere:
    #   @article = Scraper::Article.new(:content => html_string)
    #
    #   @article.title
    #   => "Open Source Initiative OSI - The MIT License:Licensing ..."
    #
    #   @article.text
    #   => "The MIT License\nCopyright (c) <year> <copyright holders> ..."
    #
    #   @article.html
    #   => "<img width=\"100\" height=\"137\" alt=\"[OSI Approved ..."
    #
    # Raises ArgumentError when neither :content nor :url is given (or the
    # :url cannot be fetched through open-uri), and Unsupported when the
    # page has no paragraph from which a content container can be chosen.
    # Submit an issue at http://github.com/cyx/scraper/issues in that case.
    def initialize( args = {} )
      content   = extract_from_args( args )
      @document = Nokogiri::HTML replace_double_brs_and_fonts(content)

      # A page without <title> yields a nil title instead of a NoMethodError.
      title_node = @document.search('title').first
      @title     = title_node && title_node.content

      @top_div = calculate_top_div @document.search('p')

      unless @top_div
        raise Unsupported, "The content is unsupported at this time"
      end

      clean!( @top_div )
    end

    # Plain text of the selected content container, surrounding whitespace
    # removed.
    def text
      @top_div.content.strip
    end

    # Inner HTML of the selected (and cleaned) content container.
    def html
      @top_div.inner_html
    end

    private

    # Returns the raw HTML to parse: args[:content] verbatim when present,
    # otherwise the body fetched from args[:url].
    def extract_from_args(args)
      if args[:content]
        args[:content]
      elsif args[:url]
        read_url(args[:url])
      else
        raise ArgumentError, "Scraper::Article#initialize only accepts content or url as its argument options"
      end
    end

    # Fetches +url+ through open-uri's URI#read rather than Kernel#open:
    # Kernel#open treats "|cmd" as a pipe and plain strings as local paths,
    # and stopped opening URLs in Ruby 3.0. Anything open-uri cannot read is
    # rejected with ArgumentError.
    def read_url(url)
      uri = URI.parse(url.to_s)

      unless uri.respond_to?(:read)
        raise ArgumentError, "Scraper::Article cannot fetch #{url.inspect}; only URLs readable by open-uri are supported"
      end

      uri.read
    end

    # Strips inline styles, low-content divs and every form/object/table/
    # h1/h2/iframe element from +node+, in that order.
    def clean!( node )
      clean_styles!(node)
      kill_divs!(node)
      %w[form object table h1 h2 iframe].each { |tag| clean_tags!(node, tag) }
    end

    # Returns the node with the highest score, or nil when there are no
    # paragraphs to rate (so that #initialize can raise Unsupported).
    def calculate_top_div( paragraphs )
      best = rate_and_score_paragraphs( paragraphs ).sort_by { |e| e[:score] }.last
      best && best[:node]
    end

    # Builds one { :node => parent, :score => Integer } hash per paragraph.
    def rate_and_score_paragraphs( paragraphs )
      paragraphs.map do |paragraph|
        parent = paragraph.parent
        score  = 0

        score += name_score(parent.attribute('class').to_s, BAD_CLASS_NAMES, GOOD_CLASS_NAMES)
        score += name_score(parent.attribute('id').to_s, BAD_ID_NAMES, GOOD_ID_NAMES)
        score += 1 if paragraph.content.length > 10
        score += get_char_count(parent)

        { :node => parent, :score => score }
      end
    end

    # -50 when +value+ matches +bad+, +25 when it matches +good+, else 0.
    # The bad pattern takes precedence over the good one.
    def name_score( value, bad, good )
      if value.match(bad)
        -50
      elsif value.match(good)
        25
      else
        0
      end
    end

    # Turns runs of two <br> tags into paragraph breaks and drops <font>
    # tags so that the paragraph-based scoring has something to work with.
    def replace_double_brs_and_fonts( content )
      pattern = /<br\/?>[ \r\n\s]*<br\/?>/
      content.gsub(pattern, '</p><p>').gsub(/<\/?font[^>]*>/, '')
    end

    # Number of pieces the node's text splits into on +char+ (for text
    # without trailing separators this is the separator count plus one).
    def get_char_count( node, char = ',' )
      node.content.split(char).length
    end

    # Removes the style attribute from every descendant of +node+.
    def clean_styles!( node )
      node.search('*').remove_attr('style')
    end

    # Removes divs that hold little comma-separated text and are dominated
    # by images, list items or links, contain no paragraph, or embed media.
    def kill_divs!( node )
      node.search('div').each do |div|
        next unless get_char_count( div ) < 10

        paragraphs = div.search('p').length
        images     = div.search('img').length
        items      = div.search('li').length
        links      = div.search('a').length
        embeds     = div.search('embed').length

        if images > paragraphs || items > paragraphs || links > paragraphs || paragraphs == 0 || embeds > 0
          div.remove
        end
      end
    end

    # Removes every +tags+ match under +node+ whose text splits into fewer
    # than +min_words+ space-separated pieces. With the default threshold
    # this removes practically every match.
    def clean_tags!(node, tags, min_words = 1000000)
      node.search(tags).each do |target|
        target.remove if get_char_count( target, " " ) < min_words
      end
    end
  end
end
@@ -0,0 +1,97 @@
1
+ # Copyright (c) 2009 [Cyril David]
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining
4
+ # a copy of this software and associated documentation files (the
5
+ # "Software"), to deal in the Software without restriction, including
6
+ # without limitation the rights to use, copy, modify, merge, publish,
7
+ # distribute, sublicense, and/or sell copies of the Software, and to
8
+ # permit persons to whom the Software is furnished to do so, subject to
9
+ # the following conditions:
10
+ #
11
+ # The above copyright notice and this permission notice shall be
12
+ # included in all copies or substantial portions of the Software.
13
+ #
14
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21
+
22
+ require 'uri'
23
+ require 'builder'
24
+
25
module Scraper
  # Builds embed markup and a thumbnail URL for a youtube.com watch URL.
  class Youtube
    # Hosts accepted as YouTube: youtube.com with at most one sub-domain
    # label. Host names are case-insensitive, hence the /i flag.
    VALID_HOST_NAME = /\A([a-z]+\.)?youtube\.com\z/i
    # Finds the "v" query parameter. The parameter must start the query
    # string or follow "&" so that e.g. "rev=1" is not mistaken for "v=1".
    # The id is capture group 2.
    VIDEO_ID_MATCHER = /(\A|&)v=([^&]+)/

    WIDTH = 325
    HEIGHT = 244
    ALLOW_FULL_SCREEN = true
    MIME_TYPE = 'application/x-shockwave-flash'

    # The value of the "v" query parameter of the URL given to #initialize.
    attr_reader :video_id

    class << self
      # Handler-detection hook: true when args[:url] parses and its host is
      # a YouTube host, false otherwise (missing, host-less or unparsable
      # URLs included, so that handler detection can move on).
      def =~( args )
        url = args[:url]
        return false unless url

        valid_host_name?( URI.parse( url.to_s ).host ) ? true : false
      rescue URI::InvalidURIError
        false
      end

      # Truthy (MatchData) when +host_name+ is a YouTube host; nil for
      # anything else, including a nil host.
      def valid_host_name?( host_name )
        host_name.to_s.match(VALID_HOST_NAME)
      end
    end

    # args[:url] must be a youtube.com URL carrying a "v" query parameter.
    #
    # Raises ArgumentError when the URL is missing, is not from youtube.com
    # or has no video id. An unparsable URL raises URI::InvalidURIError.
    def initialize( args = {} )
      unless args[:url]
        raise ArgumentError, "A :url option is required"
      end

      uri = URI.parse(args[:url].to_s)

      unless self.class.valid_host_name?(uri.host)
        raise ArgumentError, "URL must be from youtube.com"
      end

      @video_id = extract_video_id_from_query_string( uri.query )

      unless @video_id
        raise ArgumentError, "URL must have a video ID in it"
      end
    end

    # Builds the <object>/<embed> markup for the video with Builder.
    # args[:width] and args[:height] override WIDTH and HEIGHT.
    def html( args = {} )
      w, h = args[:width] || WIDTH, args[:height] || HEIGHT

      xml = Builder::XmlMarkup.new
      xml.object(:width => w, :height => h) do |object|
        object.param :name => 'movie', :value => movie_url
        object.param :name => 'allowFullScreen', :value => ALLOW_FULL_SCREEN
        object.param :name => 'allowscriptaccess', :value => 'always'
        object.embed :src => movie_url,
                     :type => MIME_TYPE,
                     :allowscriptaccess => 'always',
                     :allowfullscreen => ALLOW_FULL_SCREEN,
                     :width => w,
                     :height => h
      end
    end

    # URL of the video's thumbnail image.
    def thumbnail
      "http://i.ytimg.com/vi/#{video_id}/2.jpg"
    end

    private

    # NOTE(review): deliberately kept as a Symbol - presumably so that
    # Builder emits the attribute value without escaping "&"; confirm
    # against Builder's attribute handling before changing it to a String.
    def movie_url
      :"http://www.youtube.com/v/#{video_id}&hl=en&fs=1"
    end

    # Returns the "v" parameter of +query_string+, or nil when the query
    # string is nil or has no such parameter.
    def extract_video_id_from_query_string( query_string )
      matches = query_string.to_s.match(VIDEO_ID_MATCHER)
      matches && matches[2]
    end
  end
end
data/lib/scraper.rb ADDED
@@ -0,0 +1,16 @@
1
# Namespace for the scrapers. The handler classes are autoloaded on first
# use from the directory that contains this file.
module Scraper
  autoload :Article, 'scraper/article'
  autoload :Youtube, 'scraper/youtube'

  # Handler class names in detection order. Article's =~ accepts
  # everything, so it has to stay last.
  HANDLERS = [ :Youtube, :Article ]
end

# Instantiates the first handler in Scraper::HANDLERS whose class-level =~
# accepts +args+ and returns that instance.
#
# Raises ArgumentError when no handler accepts +args+.
def Scraper( args = {} )
  handler = Scraper::HANDLERS.detect { |h| Scraper.const_get(h) =~ args }

  unless handler
    # inspect keeps the message readable on every Ruby version.
    raise ArgumentError, "Scraper cannot handle content from #{args.inspect}"
  end

  Scraper.const_get( handler ).new( args )
end

# The autoload paths above are resolved through $LOAD_PATH. Register the
# directory as an absolute path (so a later chdir cannot break autoloading)
# and only once (so reloading this file does not pile up duplicates).
scraper_lib_dir = File.expand_path(File.dirname(__FILE__))
$LOAD_PATH.unshift( scraper_lib_dir ) unless $LOAD_PATH.include?( scraper_lib_dir )
data/scraper.gemspec ADDED
@@ -0,0 +1,54 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
# Gem specification for scraper 0.2.0.
Gem::Specification.new do |s|
  s.name = %q{scraper}
  s.version = "0.2.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Cyril David"]
  s.date = %q{2009-07-31}
  s.email = %q{cyx.ucron@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.markdown"
  ]
  s.files = [
    ".document",
    ".gitignore",
    "LICENSE",
    "README.markdown",
    "Rakefile",
    "VERSION",
    "lib/scraper.rb",
    "lib/scraper/article.rb",
    "lib/scraper/youtube.rb",
    "scraper.gemspec",
    "test/article_test.rb",
    "test/fixtures/scraped.html",
    "test/fixtures/unwebbable.html",
    "test/scraper_test.rb",
    "test/test_helper.rb",
    "test/youtube_test.rb"
  ]
  s.homepage = %q{http://github.com/cyx/scraper}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.3}
  # Replaces the generator's "TODO" placeholder.
  s.summary = %q{Scrapes video embeds and article content from web pages}
  s.test_files = [
    "test/article_test.rb",
    "test/scraper_test.rb",
    "test/test_helper.rb",
    "test/youtube_test.rb"
  ]

  if s.respond_to? :specification_version then
    s.specification_version = 3
  end

  # The previous version-sniffing conditional had empty branches and read
  # Gem::RubyGemsVersion, a constant that current RubyGems no longer
  # defines, so it has been dropped.
  #
  # Runtime dependencies: lib/scraper/article.rb requires nokogiri and
  # lib/scraper/youtube.rb requires builder.
  s.add_dependency(%q<nokogiri>)
  s.add_dependency(%q<builder>)
end
@@ -0,0 +1,34 @@
1
+ require 'test_helper'
2
+
3
# Exercises Scraper::Article against a saved copy of the A List Apart
# "Unwebbable" article (test/fixtures/unwebbable.html).
class Scraper::ArticleTest < Test::Unit::TestCase
  context "given the unwebbable A-List-Apart article" do
    setup do
      @fixture = fixture_file('unwebbable.html')
      @article = Scraper::Article.new( :content => @fixture )
    end

    should "not raise an error during initialization" do
      assert_nothing_raised do
        @article = Scraper::Article.new( :content => @fixture )
      end
    end

    should "have a title: A List Apart: Articles: Unwebbable" do
      assert_equal 'A List Apart: Articles: Unwebbable',
                   @article.title
    end

    should "have a content body starting with It's time we came to grips" do
      assert_match(/It’s time we came to grips/m, @article.text)
    end

    should "have a content ending with XML is finally a viable option." do
      # The dot is escaped and \z anchors at the very end of the text;
      # "$" would also be satisfied at the end of any earlier line.
      assert_match(/XML is finally a viable option\.\z/, @article.text)
    end

    should "have the html content in scraped" do
      assert_equal fixture_file('scraped.html'),
                   @article.html
    end
  end
end
@@ -0,0 +1,150 @@
1
+
2
+
3
+
4
+ <p>
5
+ <strong>It’s time we came to grips with the fact that not every “document” can be a “web page.” Some forms of writing just cannot be expressed in HTML—or they need to be bent and distorted to do so. But for once, XML might actually help.
6
+ </strong></p>
7
+
8
+ <p>
9
+ The creation myth of the web tells us that Tim Berners-Lee invented HTML as a means of publishing physics research papers. True? It doesn’t matter; it’s a founding legend of the web whose legacy continues to this day. You can gin up as many web applications as you want, but the web is mostly still a place to publish <em>documents</em>.
10
+ </p>
11
+
12
+ <p>
13
+ The web is replete with projects to “digitize legacy content”—patent applications, books, photographs, everything. While photographs might survive well as JPEGs or TIFFs (disregarding accessibility issues for a moment), the bulk of this legacy content requires semantic markup for computers to understand it. A sheet of paper provides complete authorial freedom, but that freedom can translate poorly to the coarse semantics of HTML. The digitization craze—that’s what it is—crashes headlong into HTML semantics.
14
+ </p>
15
+
16
+ <p>
17
+ Some documents cannot be published using HTML. In many cases, we shouldn’t even bother trying. In other cases, we have to radically change the appearance and structure of the document. Ideally, we’ll start using custom XML document types—which, finally and at long last, might actually work.
18
+ </p>
19
+
20
+
21
+
22
+ <p>
23
+ An example of the conundrum of transferring print documents to the web, one that has become legendary in some circles, is the film screenplay.
24
+ </p>
25
+
26
+ <p>
27
+ A lot of people want to write a screenplay. The outcomes for most of these writers are the same: Nobody films and releases their movie. And they all go through the same phase—learning the generations-old “style” of screenplay formatting.
28
+ </p>
29
+
30
+ <div class="illustration full left">
31
+ <img src="/d/unwebbable/die-hard-script.jpg" alt="Screenplay"><p>Typewritten screenplay from <cite><a href="http://www.dailyscript.com/scripts/Die_Hard_2.pdf">Die Hard 2</a></cite>.</p>
32
+ </div>
33
+
34
+ <p>
35
+ Originating in the typewriter age, <a href="http://www.flickr.com/photos/joeclark/collections/72157621289042508/">screenplay layouts</a> are custom-engineered so that one printed page (in what we now call U.S. letter size) equals almost exactly one minute of onscreen time. Since most commercial movies run about two hours in length, typical Hollywood movie scripts are 118 to 122 pages long.
36
+ </p>
37
+
38
+ <p>
39
+ Typography is lousy; old typewriter fonts of yesteryear were errantly mapped onto today’s spindly Courier type. But as an example of <em>document engineering</em>, scripts are brilliant.
40
+ </p>
41
+
42
+ <ul>
43
+ <li>There’s an entire science involved in text indention. Text is rarely, if ever, “centered”; everything lines up at a <dfn>tab stop</dfn>, a concept that CSS expunges from the collective memory. (You could set left margins using the <a href="http://www.w3.org/TR/css3-values/#lengths" title="CSS3 Values and Units: Lengths"><code>ch</code> unit</a> in CSS3, but nobody does.)</li>
44
+ <li>With careful alignments like these, it’s easy to scan down a screenplay page. Semantic use of ALL CAPITALS aids scanning, and clearly does not live up to the purely mechanical name CSS gives it, “text-transform.”</li>
45
+ </ul>
46
+ <p>
47
+ And now people want to transfer the format—intact—to the web. It’s not going to work.
48
+ </p>
49
+
50
+ <ul>
51
+ <li>Web “pages” may be called that, but the term is metaphorical. It has nothing to do with sheets of paper that equate to screen time. (Right away that means a shooting script’s many headers and footers would disappear, since we’re dealing with only one “page.”)
52
+ </li>
53
+ <li>Nobody seriously intends screenplays on the web to have the same function they do in real life—getting read, getting optioned or bought, and getting shot. All of that happens on paper, not on Firefox.
54
+ </li>
55
+ <li>HTML (per se) <a href="http://www.alistapart.com/articles/semanticsinhtml5" title="A List Apart: Semantics in HTML5">is not extensible</a>. Extensible HTML (XHTML) has really not been extended. Hence the following truism is not going to change: HTML does not have enough tags for the semantics of screenplays, where nearly everything needs its own tag.
56
+
57
+ <ul>
58
+ <li>Dialogue seems to be no problem, but dialogue is intermingled with screen and actor instructions, and in HTML both of those would just be placed in paragraph elements—even though the function, and expected appearance, differ drastically. </li>
59
+
60
+ <li>What about the myriad headings, including the names of people speaking and notations for the time of day and the manner of speech (often called slugs or sluglines)? We have “a lot” of heading tags in HTML—six of them—but they are arranged hierarchically, not according to function. Would class names really suffice here—that is, <code>H2 class="slugline"</code> versus <code>H2 class="charactername"</code>? Really, the answer is no. Script headings and HTML headings are two different things.</li>
61
+ </ul>
62
+ </li>
63
+
64
+ <li>The real movie industry doesn’t need HTML in the first place; it already has viable electronic exchange formats for scripts.
65
+
66
+ <ol>
67
+ <li>One is the proprietary format of <a href="http://www.finaldraft.com/products-and-services/final-draft/" title="‘The Industry Standard’">Final Draft</a>, the software that dominates the screenplay market the way MS Word dominates in offices. Open-source fanatics may look at this as one more delicious chance to inveigh against a proprietary format, but screenwriters have better things to worry about than open source. Anyway, Final Draft 8’s default document format <a href="http://www.finaldraft.com/products-and-services/final-draft/features.php?section=devfeatures#fileformat" title=".fdx, in fact">is now XML</a>.</li>
68
+ <li>The other is PDF. The movie business doesn’t have to care about accessibility, so even PDFs to which no accessibility features have been added suffice for script exchange. You don’t need <a href="http://www.alistapart.com/articles/pdf_accessibility" title="Facts and Opinions About PDF Accessibility">tagged PDF</a>, which also doesn’t have enough semantics for screenplays. (You could, in theory, write your own PDF tags, since they’re just XML.)</li>
69
+ </ol>
70
+ </li>
71
+ </ul>
72
+ <p>
73
+ The quest to adapt scripts to the web recalls other “category errors,” to use Martin Amis’s phrase. Electronic commerce, we eventually figured out, does not take the form of “shopping malls” you “walk” through. “Magazines” and “catalogues” do not have discrete pages you flip (complete with sound effects) and dog-ear. “Web sites” do not <a href="http://blog.fawny.org/2008/09/07/billhillforte/" title="blog.fawny.org: Keep Bill Hill Off the Web">look like magazine layouts</a>, complete with multicolumn text and callouts.
74
+ </p>
75
+
76
+ <p>
77
+ Tellingly, this quest recalls early television, which, conventional wisdom holds, behaved more like filmed stageplays. Bringing scripts to the web is noticeably worse than filming a stageplay.
78
+ </p>
79
+
80
+ <p>
81
+ Now, people have <em>tried</em> to make web pages look exactly like typewritten screenplays. The star of this show is screenwriter and inveterate blogger <a href="http://JohnAugust.com/" title="JohnAugust.com">John August</a>. <a href="http://scrippets.org/" title="Scrippets.org">Scrippets</a>, August’s plug-in for WordPress, Blogger, and other systems, does everything it can to spin straw into gold. Among other things, one of August’s use cases is perfect “screenplay” formatting when viewed in an RSS reader, and the only way to make that happen is through presentational HTML and inline styles. These are, of course, outmoded development methods.
82
+ </p>
83
+
84
+ <p>
85
+ August pitches his project thus (emphasis added): “With Scrippets, you can add boxes of <em>nicely-formatted script</em> to your blog.” That’s actually a restatement of the problem—failed reliance on a page metaphor, failed efforts to duplicate typewriter typography, and failed attempts to replicate one-page-per-minute layout. Script formatting is “nice” for print, but it’s wrong for the web—even for “little boxes” of script content.
86
+ </p>
87
+
88
+ <p>
89
+ Worse, Scrippets ignores whatever small contribution HTML semantics can offer in marking up a screenplay. Pretty much everything gets marked up as paragraphs, but not everything is a paragraph. This is a worse sin than loading up <code>H2</code>s with class names in an uphill battle to notate screenplay semantics.
90
+ </p>
91
+
92
+ <h3>The screenplay solution</h3>
93
+
94
+ <p>
95
+ The way to adapt scripts for the web is through cosmetic surgery. And we have a precedent for it. There’s a healthy market for screenplays published in book form. In fact, “the shooting script” is an actual U.S. trademark (from <a href="http://www.newmarketpress.com/category.asp?id=40" title="The Shooting Script®">Newmarket Press</a>) for one series of book versions of movie screenplays.
96
+ </p>
97
+
98
+ <ul>
99
+ <li>Some books just reprint typewritten screenplays at reduced size. This may make you feel like a pro, but what you should feel is cheated: You’re paying good money to read an author’s typewritten manuscript. Spindly Courier looks even worse in reduced size.</li>
100
+ <li>
101
+ Other books completely redesign <em>typewritten</em> screenplays into a design native to <em>book publishing</em>. In a typical layout, speaker names are run inline with dialogue, normal book margins are used, and there’s a huge compaction of vertical whitespace. Typewritten screenplays read quite well in their intended context—but so do screenplay books in their context. (Retypeset scripts have also been used as <a href="http://www.flickr.com/photos/chrisnoessel/2650132816/" title="Learning English by script">language-learning aids</a>.)</li>
102
+ </ul>
103
+ <p>
104
+ Hence to adapt this existing printed form to the web, you have to abandon all hope of duplicating original typescript formatting. You have to design something native to the web, with its relatively weak semantics and pageless or single-page architecture.
105
+ </p>
106
+
107
+ <ul>
108
+ <li>You could use HTML definition lists to mark up dialogue—<a href="http://www.w3.org/TR/html4/struct/lists.html#edef-DL" title="Lists in HTML documents">explicitly permitted in (W3C-brand) HTML</a>, <a href="http://www.whatwg.org/specs/web-apps/current-work/#the-dl-element" title="HTML5 spec: DL">explicitly banned</a> by Ian Hickson under HTML5. (There, <a href="http://www.whatwg.org/specs/web-apps/current-work/#the-dialog-element" title="HTML5 spec: DIALOG">use <code>DIALOG</code> instead</a>, even though the descendants of that tag, <code>DT</code> and <code>DD</code>, are the same descendants <code>DL</code> has.) </li>
109
+ <li>You can <a href="http://www.script-o-rama.com/movie_scripts/b/the-birds-script-screenplay.html" title="As in ‘The Birds’">use <code>PRE</code></a> to fake indention and line breaks (but you can’t fake the division of a script into page<em>s</em>).</li>
110
+ <li>You can disregard text indention and just <a href="http://joeflood.com/screenplays_new/eurabia/" title="As in ‘Eurabia’">use <code>CENTER</code>ed text</a>.</li>
111
+ <li>You could, without too much of a stretch, mark up a script as a table.</li>
112
+ <li>You could just not bother too much with semantics, run character names (in bold or <code>STRONG</code>) inline with dialogue, and use HTML headings where feasible.</li>
113
+ </ul>
114
+ <h3>Other print formats that need transformation</h3>
115
+
116
+ <ul>
117
+ <li>
118
+ <strong>Mastheads</strong>: The list of who does what at a magazine or newspaper is actually semantically complex, because each person’s title or the department they work in seems to be a heading. But a masthead marked up with <code>H1</code> through <code>H6</code> essentially pollutes the tag stream of the surrounding web page.</li>
119
+ <li>
120
+ <strong>Callouts</strong> and <strong>sidebars</strong>: These structures, familiar from magazines, newspapers, and nonfiction books, cause serious confusion in creating a functioning document tree. (At what exact point in the tag stream are you expected to read the callout or sidebar?)</li>
121
+ <li>
122
+ <strong>Footnotes</strong>: There isn’t a structure for footnotes in HTML (though there is in tagged PDF). Developers have tried all sorts of hacks, including JavaScript show/hide widgets and various rats’ nests of links and reverse links. For literature fans, HTML’s lack of footnotes makes the work of the late David Foster Wallace functionally impossible to render on the web (especially his footnotes within footnotes).</li>
123
+ <li>
124
+ <strong>Charticles</strong>: With origins commonly attributed to <cite><a href="http://fawny.org/spy/" title="Ten Years Ago in ‘Spy’">Spy</a></cite>, a charticle is an illustrated featurette with a lot more accompanying text than what a bare illustration has. By way of comparison, a Flickr photo <a href="http://www.flickr.com/photos/fernandofelix/1382869443/" title="As in this example from Fernando Felix">festooned with notes</a> is functionally identical to a charticle, but HTML has no semantics for it.</li>
125
+ <li>
126
+ <strong>Math and science</strong>: Yes, that old chestnut. Before you exclaim “MathML!” the way a pensioner might yell out “Bingo!,” understand that barely anybody uses MathML on real web pages due to serious authoring difficulty—physicist <a href="http://golem.ph.utexas.edu/~distler/blog/" title="With his blog Musings">Jacques Distler</a> remains among the very few who do.</li>
127
+ </ul>
128
+
129
+ <p>
130
+ Armed with this knowledge, what are we going to do? Prediction: nothing. People will continue to fake the appearance of scripts and use John August–caliber presentational code. But we do have an alternative.
131
+ </p>
132
+
133
+ <p>
134
+ The case typified by screenplays is merely a new variation of the difficulty of encoding literature in XML. People have tried it time and time again over the years, but barely any DTD has gotten traction. People just want to mark up everything in HTML (<a href="http://diveintomark.org/archives/2009/03/27/dive-into-history-2009-edition" title="As Mark Pilgrim has learned">which has staying power</a>). Ill-trained authors mark up everything as a paragraph or a <code>DIV</code>.</p>
135
+
136
+ <p>
137
+ People seem to have taken the catchphrase “HTML is the lingua franca of the web” a bit too literally. HTML derives from SGML; XHTML is XML in a new pair of shoes. That’s four kinds of markup right there, but everybody acts as though there is only one kind, HTML. (Most of the time, browsers act like XTHML is HTML with trailing slashes.) Even electronic books are marked up as HTML, as the ePub file format is essentially XHTML 1.1 inside a container file—but that makes ePub files simultaneously HTML and XML. If we can spit those out, why can’t we spit out other kinds of XML?
138
+ </p>
139
+
140
+ <p>
141
+ We are well past the stage where browsers could <em>not</em> be expected to display valid, well-formed XML. Browsers can now do exactly that. Variant literary document types could actually work now. But because they languished on the vine for so long, now it seems nobody wants to make them work. After all, isn’t our new future wrapped up in HTML5? Just as our old future was wrapped up in XHTML2?
142
+ </p>
143
+
144
+
145
+
146
+ <p>
147
+ The web is, of course, a wondrous thing, but its underlying language lacks the vocabulary to express even the things that humans have already expressed elsewhere. We ought to accept that some documents have to be reformatted for the web, at least if the goal is using plain HTML. To give web documents the rich semantics of print documents, XML is finally a viable option.<img src="/pix/eoai.gif" alt="" id="eoai"></p>
148
+
149
+
150
+
@@ -0,0 +1,356 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
2
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
3
+ <html xmlns="http://www.w3.org/1999/xhtml">
4
+
5
+
6
+ <head>
7
+ <title>A List Apart: Articles: Unwebbable</title>
8
+ <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
9
+ <meta name="description" content="" />
10
+ <meta name="keywords" content="" />
11
+ <link rel="alternate" type="application/rss+xml" title="A List Apart main RSS feed" href="http://www.alistapart.com/site/rss" />
12
+ <link rel="stylesheet" type="text/css" href="/css/print.css" media="print" />
13
+ <script type="text/javascript">if(top!=self){top.location.replace(self.location.href);}</script>
14
+ <link rel="stylesheet" href="/css/article.css" type="text/css" media="all" />
15
+ <style type="text/css">
16
+ @import url(/css/288.css);
17
+ </style>
18
+
19
+ <style type="text/css" media="screen">
20
+ ul li ul { margin-top: 10px; }
21
+
22
+ ul li ol { margin-top: 10px; }
23
+ </style>
24
+ </head>
25
+
26
+
27
+ <body class="articles" onload="">
28
+
29
+ <ul id="navbar">
30
+ <li id="articles"><a href="http://www.alistapart.com/articles/" title="Articles">Articles</a></li>
31
+ <li id="topics"><a href="http://www.alistapart.com/topics/" title="Topics">Topics</a></li>
32
+ <li id="about"><a href="http://www.alistapart.com/about/" title="About">About</a></li>
33
+ <li id="contact"><a href="http://www.alistapart.com/contact/" title="Contact">Contact</a></li>
34
+ <li id="contribute"><a href="http://www.alistapart.com/contribute/" title="Contribute">Contribute</a></li>
35
+ <li id="feed"><a href="http://www.alistapart.com/feed/" title="Feed">Feed</a></li>
36
+ </ul>
37
+ <h1 id="masthead"><a href="http://www.alistapart.com/"><img src="/pix/alalogo.gif" alt="A LIST Apart: For People Who Make Websites" /></a></h1>
38
+
39
+
40
+ <div id="ish">
41
+ <a href="http://www.alistapart.com/issues/288" title="Issue 288">No. <em>288</em></a>
42
+ </div>
43
+
44
+
45
+ <div id="main">
46
+
47
+
48
+
49
+
50
+ <div id="content" class="column">
51
+
52
+ <div class="ishinfo">July <b>21, 2009</b></div>
53
+
54
+ <h1 class="title"><a href="http://www.alistapart.com/articles/unwebbable/">Unwebbable</a></h1>
55
+ <h3 class="byline">
56
+ by <a href="http://www.alistapart.com/authors/c/joeclark"> Joe Clark</a>
57
+
58
+
59
+ </h3>
60
+
61
+ <ul id="metastuff">
62
+ <li>
63
+ Published in: <a href="http://www.alistapart.com/topics/topic/htmlxhtml/" title="HTML and XHTML">HTML and XHTML</a>
64
+ <b>|</b>
65
+ </li>
66
+ <li class="discuss">
67
+ <p><a href="http://www.alistapart.com/comments/unwebbable/">Discuss this article &raquo;</a></p>
68
+ </li>
69
+ </ul>
70
+
71
+ <div id="articletext">
72
+ <div class="illustration right half"><img src="/d/unwebbable/unwebbable.jpg" alt="Unwebbable" /></div>
73
+
74
+ <p>
75
+ <strong>It&#8217;s time we came to grips with the fact that not every &#8220;document&#8221; can be a &#8220;web page.&#8221; Some forms of writing just cannot be expressed in HTML&#8212;or they need to be bent and distorted to do so. But for once, XML might actually help.
76
+ </strong></p>
77
+
78
+ <p>
79
+ The creation myth of the web tells us that Tim Berners-Lee invented HTML as a means of publishing physics research papers. True? It doesn&#8217;t matter; it&#8217;s a founding legend of the web whose legacy continues to this day. You can gin up as many web applications as you want, but the web is mostly still a place to publish <em>documents</em>.
80
+ </p>
81
+
82
+ <p>
83
+ The web is replete with projects to &#8220;digitize legacy content&#8221;&#8212;patent applications, books, photographs, everything. While photographs might survive well as JPEGs or TIFFs (disregarding accessibility issues for a moment), the bulk of this legacy content requires semantic markup for computers to understand it. A sheet of paper provides complete authorial freedom, but that freedom can translate poorly to the coarse semantics of HTML. The digitization craze&#8212;that&#8217;s what it is&#8212;crashes headlong into HTML semantics.
84
+ </p>
85
+
86
+ <p>
87
+ Some documents cannot be published using HTML. In many cases, we shouldn&#8217;t even bother trying. In other cases, we have to radically change the appearance and structure of the document. Ideally, we&#8217;ll start using custom XML document types&#8212;which, finally and at long last, might actually work.
88
+ </p>
89
+
90
+ <h2>The screenplay problem</h2>
91
+
92
+ <p>
93
+ An example of the conundrum of transferring print documents to the web, one that has become legendary in some circles, is the film screenplay.
94
+ </p>
95
+
96
+ <p>
97
+ A lot of people want to write a screenplay. The outcomes for most of these writers are the same: Nobody films and releases their movie. And they all go through the same phase&#8212;learning the generations-old &#8220;style&#8221; of screenplay formatting.
98
+ </p>
99
+
100
+ <div class="illustration full left"><img src="/d/unwebbable/die-hard-script.jpg" alt="Screenplay" /><p>Typewritten screenplay from <cite><a href="http://www.dailyscript.com/scripts/Die_Hard_2.pdf">Die Hard 2</a></cite>.</p></div>
101
+
102
+ <p>
103
+ Originating in the typewriter age, <a href="http://www.flickr.com/photos/joeclark/collections/72157621289042508/">screenplay layouts</a> are custom-engineered so that one printed page (in what we now call U.S. letter size) equals almost exactly one minute of onscreen time. Since most commercial movies run about two hours in length, typical Hollywood movie scripts are 118 to 122 pages long.
104
+ </p>
105
+
106
+ <p>
107
+ Typography is lousy; old typewriter fonts of yesteryear were errantly mapped onto today&#8217;s spindly Courier type. But as an example of <em>document engineering</em>, scripts are brilliant.
108
+ </p>
109
+
110
+ <ul>
111
+ <li>There&#8217;s an entire science involved in text indention. Text is rarely, if ever, &#8220;centered&#8221;; everything lines up at a <dfn>tab stop</dfn>, a concept that CSS expunges from the collective memory. (You could set left margins using the <a href="http://www.w3.org/TR/css3-values/#lengths" title="CSS3 Values and Units: Lengths"><code>ch</code> unit</a> in CSS3, but nobody does.)</li>
112
+ <li>With careful alignments like these, it&#8217;s easy to scan down a screenplay page. Semantic use of ALL CAPITALS aids scanning, and clearly does not live up to the purely mechanical name CSS gives it, &#8220;text-transform.&#8221;</li>
113
+ </ul>
114
+ <p>
115
+ And now people want to transfer the format&#8212;intact&#8212;to the web. It&#8217;s not going to work.
116
+ </p>
117
+
118
+ <ul>
119
+ <li>Web &#8220;pages&#8221; may be called that, but the term is metaphorical. It has nothing to do with sheets of paper that equate to screen time. (Right away that means a shooting script&#8217;s many headers and footers would disappear, since we&#8217;re dealing with only one &#8220;page.&#8221;)
120
+ </li>
121
+ <li>Nobody seriously intends screenplays on the web to have the same function they do in real life&#8212;getting read, getting optioned or bought, and getting shot. All of that happens on paper, not on Firefox.
122
+ </li>
123
+ <li>HTML (per se) <a href="http://www.alistapart.com/articles/semanticsinhtml5" title="A List Apart: Semantics in HTML5">is not extensible</a>. Extensible HTML (XHTML) has really not been extended. Hence the following truism is not going to change: HTML does not have enough tags for the semantics of screenplays, where nearly everything needs its own tag.
124
+
125
+ <ul>
126
+ <li>Dialogue seems to be no problem, but dialogue is intermingled with screen and actor instructions, and in HTML both of those would just be placed in paragraph elements&#8212;even though the function, and expected appearance, differ drastically. </li>
127
+
128
+ <li>What about the myriad headings, including the names of people speaking and notations for the time of day and the manner of speech (often called slugs or sluglines)? We have &#8220;a lot&#8221; of heading tags in HTML&#8212;six of them&#8212;but they are arranged hierarchically, not according to function. Would class names really suffice here&#8212;that is, <code>H2 class="slugline"</code> versus <code>H2 class="charactername"</code>? Really, the answer is no. Script headings and HTML headings are two different things.</li></ul>
129
+ </li>
130
+
131
+ <li>The real movie industry doesn&#8217;t need HTML in the first place; it already has viable electronic exchange formats for scripts.
132
+
133
+ <ol>
134
+ <li>One is the proprietary format of <a href="http://www.finaldraft.com/products-and-services/final-draft/" title="‘The Industry Standard’">Final Draft</a>, the software that dominates the screenplay market the way MS Word dominates in offices. Open-source fanatics may look at this as one more delicious chance to inveigh against a proprietary format, but screenwriters have better things to worry about than open source. Anyway, Final Draft 8&#8217;s default document format <a href="http://www.finaldraft.com/products-and-services/final-draft/features.php?section=devfeatures#fileformat" title=".fdx, in fact">is now XML</a>.</li>
135
+ <li>The other is PDF. The movie business doesn&#8217;t have to care about accessibility, so even PDFs to which no accessibility features have been added suffice for script exchange. You don&#8217;t need <a href="http://www.alistapart.com/articles/pdf_accessibility" title="Facts and Opinions About PDF Accessibility">tagged PDF</a>, which also doesn&#8217;t have enough semantics for screenplays. (You could, in theory, write your own PDF tags, since they&#8217;re just XML.)</li>
136
+ </ol>
137
+
138
+ </li></ul>
139
+
140
+ <p>
141
+ The quest to adapt scripts to the web recalls other &#8220;category errors,&#8221; to use Martin Amis&#8217;s phrase. Electronic commerce, we eventually figured out, does not take the form of &#8220;shopping malls&#8221; you &#8220;walk&#8221; through. &#8220;Magazines&#8221; and &#8220;catalogues&#8221; do not have discrete pages you flip (complete with sound effects) and dog-ear. &#8220;Web sites&#8221; do not <a href="http://blog.fawny.org/2008/09/07/billhillforte/" title="blog.fawny.org: Keep Bill Hill Off the Web">look like magazine layouts</a>, complete with multicolumn text and callouts.
142
+ </p>
143
+
144
+ <p>
145
+ Tellingly, this quest recalls early television, which, conventional wisdom holds, behaved more like filmed stageplays. Bringing scripts to the web is noticeably worse than filming a stageplay.
146
+ </p>
147
+
148
+ <p>
149
+ Now, people have <em>tried</em> to make web pages look exactly like typewritten screenplays. The star of this show is screenwriter and inveterate blogger <a href="http://JohnAugust.com/" title="JohnAugust.com">John August</a>. <a href="http://scrippets.org/" title="Scrippets.org">Scrippets</a>, August&#8217;s plug-in for WordPress, Blogger, and other systems, does everything it can to spin straw into gold. Among other things, one of August&#8217;s use cases is perfect &#8220;screenplay&#8221; formatting when viewed in an RSS reader, and the only way to make that happen is through presentational HTML and inline styles. These are, of course, outmoded development methods.
150
+ </p>
151
+
152
+ <p>
153
+ August pitches his project thus (emphasis added): &#8220;With Scrippets, you can add boxes of <em>nicely-formatted script</em> to your blog.&#8221; That&#8217;s actually a restatement of the problem&#8212;failed reliance on a page metaphor, failed efforts to duplicate typewriter typography, and failed attempts to replicate one-page-per-minute layout. Script formatting is &#8220;nice&#8221; for print, but it&#8217;s wrong for the web&#8212;even for &#8220;little boxes&#8221; of script content.
154
+ </p>
155
+
156
+ <p>
157
+ Worse, Scrippets ignores whatever small contribution HTML semantics can offer in marking up a screenplay. Pretty much everything gets marked up as paragraphs, but not everything is a paragraph. This is a worse sin than loading up <code>H2</code>s with class names in an uphill battle to notate screenplay semantics.
158
+ </p>
159
+
160
+ <h3>The screenplay solution</h3>
161
+
162
+ <p>
163
+ The way to adapt scripts for the web is through cosmetic surgery. And we have a precedent for it. There&#8217;s a healthy market for screenplays published in book form. In fact, &#8220;the shooting script&#8221; is an actual U.S. trademark (from <a href="http://www.newmarketpress.com/category.asp?id=40" title="The Shooting Script&reg;">Newmarket Press</a>) for one series of book versions of movie screenplays.
164
+ </p>
165
+
166
+ <ul>
167
+ <li>Some books just reprint typewritten screenplays at reduced size. This may make you feel like a pro, but what you should feel is cheated: You&#8217;re paying good money to read an author&#8217;s typewritten manuscript. Spindly Courier looks even worse in reduced size.</li>
168
+ <li>
169
+ Other books completely redesign <em>typewritten</em> screenplays into a design native to <em>book publishing</em>. In a typical layout, speaker names are run inline with dialogue, normal book margins are used, and there&#8217;s a huge compaction of vertical whitespace. Typewritten screenplays read quite well in their intended context&#8212;but so do screenplay books in their context. (Retypeset scripts have also been used as <a href="http://www.flickr.com/photos/chrisnoessel/2650132816/" title="Learning English by script">language-learning aids</a>.)</li>
170
+ </ul>
171
+
172
+ <p>
173
+ Hence to adapt this existing printed form to the web, you have to abandon all hope of duplicating original typescript formatting. You have to design something native to the web, with its relatively weak semantics and pageless or single-page architecture.
174
+ </p>
175
+
176
+ <ul>
177
+ <li>You could use HTML definition lists to mark up dialogue&#8212;<a href="http://www.w3.org/TR/html4/struct/lists.html#edef-DL" title="Lists in HTML documents">explicitly permitted in (W3C-brand) HTML</a>, <a href="http://www.whatwg.org/specs/web-apps/current-work/#the-dl-element" title="HTML5 spec: DL">explicitly banned</a> by Ian Hickson under HTML5. (There, <a href="http://www.whatwg.org/specs/web-apps/current-work/#the-dialog-element" title="HTML5 spec: DIALOG">use <code>DIALOG</code> instead</a>, even though the descendants of that tag, <code>DT</code> and <code>DD</code>, are the same descendants <code>DL</code> has.) </li>
178
+ <li>You can <a href="http://www.script-o-rama.com/movie_scripts/b/the-birds-script-screenplay.html" title="As in ‘The Birds’">use <code>PRE</code></a> to fake indention and line breaks (but you can&#8217;t fake the division of a script into page<em>s</em>).</li>
179
+ <li>You can disregard text indention and just <a href="http://joeflood.com/screenplays_new/eurabia/" title="As in ‘Eurabia’">use <code>CENTER</code>ed text</a>.</li>
180
+ <li>You could, without too much of a stretch, mark up a script as a table.</li>
181
+ <li>You could just not bother too much with semantics, run character names (in bold or <code>STRONG</code>) inline with dialogue, and use HTML headings where feasible.</li>
182
+ </ul>
183
+
184
+
185
+
186
+ <h3>Other print formats that need transformation</h3>
187
+
188
+ <ul>
189
+ <li><strong>Mastheads</strong>: The list of who does what at a magazine or newspaper is actually semantically complex, because each person&#8217;s title or the department they work in seems to be a heading. But a masthead marked up with <code>H1</code> through <code>H6</code> essentially pollutes the tag stream of the surrounding web page.</li>
190
+ <li><strong>Callouts</strong> and <strong>sidebars</strong>: These structures, familiar from magazines, newspapers, and nonfiction books, cause serious confusion in creating a functioning document tree. (At what exact point in the tag stream are you expected to read the callout or sidebar?)</li>
191
+ <li><strong>Footnotes</strong>: There isn&#8217;t a structure for footnotes in HTML (though there is in tagged PDF). Developers have tried all sorts of hacks, including JavaScript show/hide widgets and various rats&#8217; nests of links and reverse links. For literature fans, HTML&#8217;s lack of footnotes makes the work of the late David Foster Wallace functionally impossible to render on the web (especially his footnotes within footnotes).</li>
192
+ <li><strong>Charticles</strong>: With origins commonly attributed to <cite><a href="http://fawny.org/spy/" title="Ten Years Ago in ‘Spy’">Spy</a></cite>, a charticle is an illustrated featurette with a lot more accompanying text than what a bare illustration has. By way of comparison, a Flickr photo <a href="http://www.flickr.com/photos/fernandofelix/1382869443/" title="As in this example from Fernando Felix">festooned with notes</a> is functionally identical to a charticle, but HTML has no semantics for it.</li>
193
+ <li><strong>Math and science</strong>: Yes, that old chestnut. Before you exclaim &#8220;MathML!&#8221; the way a pensioner might yell out &#8220;Bingo!,&#8221; understand that barely anybody uses MathML on real web pages due to serious authoring difficulty&#8212;physicist <a href="http://golem.ph.utexas.edu/~distler/blog/" title="With his blog Musings">Jacques Distler</a> remains among the very few who do.</li>
194
+ </ul>
195
+
196
+
197
+ <h2>How do we solve the problem?</h2>
198
+
199
+ <p>
200
+ Armed with this knowledge, what are we going to do? Prediction: nothing. People will continue to fake the appearance of scripts and use John August–caliber presentational code. But we do have an alternative.
201
+ </p>
202
+
203
+ <p>
204
+ The case typified by screenplays is merely a new variation of the difficulty of encoding literature in XML. People have tried it time and time again over the years, but barely any DTD has gotten traction. People just want to mark up everything in HTML (<a href="http://diveintomark.org/archives/2009/03/27/dive-into-history-2009-edition" title="As Mark Pilgrim has learned">which has staying power</a>). Ill-trained authors mark up everything as a paragraph or a <code>DIV</code>.</p>
205
+
206
+ <p>
207
+ People seem to have taken the catchphrase &#8220;HTML is the lingua franca of the web&#8221; a bit too literally. HTML derives from SGML; XHTML is XML in a new pair of shoes. That&#8217;s four kinds of markup right there, but everybody acts as though there is only one kind, HTML. (Most of the time, browsers act like XTHML is HTML with trailing slashes.) Even electronic books are marked up as HTML, as the ePub file format is essentially XHTML 1.1 inside a container file&#8212;but that makes ePub files simultaneously HTML and XML. If we can spit those out, why can&#8217;t we spit out other kinds of XML?
208
+ </p>
209
+
210
+ <p>
211
+ We are well past the stage where browsers could <em>not</em> be expected to display valid, well-formed XML. Browsers can now do exactly that. Variant literary document types could actually work now. But because they languished on the vine for so long, now it seems nobody wants to make them work. After all, isn&#8217;t our new future wrapped up in HTML5? Just as our old future was wrapped up in XHTML2?
212
+ </p>
213
+
214
+ <h2>Conclusion</h2>
215
+
216
+ <p>
217
+ The web is, of course, a wondrous thing, but its underlying language lacks the vocabulary to express even the things that humans have already expressed elsewhere. We ought to accept that some documents have to be reformatted for the web, at least if the goal is using plain HTML. To give web documents the rich semantics of print documents, XML is finally a viable option.<img src="/pix/eoai.gif" alt="" id="eoai" />
218
+ </p>
219
+
220
+ <div id="credits">
221
+ <ul>
222
+ <li>Illustration by <a href="/authors/c/kevincornell">Kevin Cornell</a></li>
223
+ </ul>
224
+ </div>
225
+ </div>
226
+
227
+ <div id="learnmore">
228
+ <h2>Learn More</h2>
229
+ <p>Related Topics: <a href="http://www.alistapart.com/topics/topic/htmlxhtml/" title="HTML and XHTML">HTML and XHTML</a></p>
230
+ </div>
231
+
232
+ <div class="discuss">
233
+ <h2>Discuss</h2>
234
+ <p>Was it good for you, too? <a href="http://www.alistapart.com/comments/unwebbable/">Join the discussion &raquo;</a></p>
235
+ </div>
236
+
237
+ <div id="authorbio">
238
+
239
+ <h2>About the Author</h2>
240
+
241
+ <p>
242
+ <img src="/pix/authors/joe_clark.jpg" alt=" Joe Clark" />
243
+
244
+ Toronto journalist and author <a href="http://joeclark.org/" id="joeclark-access" name="joeclark-access" title="Joe Clark">Joe Clark</a> used to work in the field of web accessibility. His ongoing missions are to raise enough money to start his own research project and to publish further books.
245
+ </p>
246
+
247
+
248
+
249
+ </div>
250
+
251
+ </div>
252
+
253
+ <div id="sidebar" class="column">
254
+
255
+ <div class="first">
256
+ <form method="post" action="http://www.alistapart.com/" >
257
+ <div class='hiddenFields'>
258
+ <input type="hidden" name="ACT" value="19" />
259
+ <input type="hidden" name="XID" value="51c03fb459d46dbaa5c114e2090b5ab722cebf7f" />
260
+ <input type="hidden" name="RP" value="search/results" />
261
+ <input type="hidden" name="NRP" value="" />
262
+ <input type="hidden" name="RES" value="20" />
263
+ <input type="hidden" name="status" value="" />
264
+ <input type="hidden" name="weblog" value="articles|issues" />
265
+ <input type="hidden" name="search_in" value="entries" />
266
+ <input type="hidden" name="where" value="all" />
267
+ <input type="hidden" name="site_id" value="1" />
268
+ </div>
269
+
270
+
271
+ <h3>Search ALA</h3>
272
+ <input type="text" name="keywords" id="search" />
273
+ <input type="image" src="/pix/go.gif" id="submit" value="Search" />
274
+ <p><input type="checkbox" name="incdisc" id="incdisc" value="comments|entries" onclick="this.form.elements['search_in'].value = (this.checked) ? 'everywhere' : 'entries';" /> include discussions</p>
275
+ </form>
276
+ </div>
277
+
278
+ <div id="topiclist">
279
+ <h3>Topics</h3>
280
+
281
+ <ul>
282
+
283
+ <li><a href="/topics/code/" title="Code">Code</a></li>
284
+
285
+ <li><a href="/topics/content/" title="Content">Content</a></li>
286
+
287
+ <li><a href="/topics/culture/" title="Culture">Culture</a></li>
288
+
289
+ <li><a href="/topics/design/" title="Design">Design</a></li>
290
+
291
+ <li><a href="/topics/process/" title="Process">Process</a></li>
292
+
293
+ <li><a href="/topics/userscience/" title="User Science">User Science</a></li>
294
+
295
+ </ul>
296
+
297
+ </div>
298
+
299
+
300
+ <div id="snapshot">
301
+ <h3>Snapshot</h3>
302
+ <p>The creation myth of the web tells us that Tim Berners-Lee invented HTML as a means of publishing physics research papers. True? It doesn’t matter; it’s a founding legend of the web whose legacy continues to this day. You can gin up as many web applications as you want, but the web is mostly still a place to publish documents.</p>
303
+ </div>
304
+
305
+
306
+ <div id="lucre">
307
+ <script type="text/javascript">
308
+ //<![CDATA[
309
+ (function(id) {
310
+ document.write('<script type="text/javascript" src="' +
311
+ 'http://www.northmay.com/deck/deck' + id + '_js.php?' +
312
+ (new Date().getTime()) + '"></' + 'script>');
313
+ })("AL");
314
+ //]]>
315
+ </script>
316
+ <p>
317
+ <a href="http://www.coudal.com/deck/">Ad via The Deck</a>
318
+ </p>
319
+ </div>
320
+
321
+ <div id="jobboard">
322
+ <h4>Job Board</h4>
323
+ <script src="http://www.37signals.com/svn/job.fcgi" type="text/javascript"></script>
324
+ </div>
325
+
326
+ <div id="colophon">
327
+
328
+ <p class="init">
329
+ Hosted by
330
+ <a href="http://mediatemple.net/"><img src="/pix/mediatemple.png" alt="Hosted by Media Temple" /></a>
331
+ </p>
332
+
333
+ <p>
334
+ Published by
335
+ <a href="http://happycog.com/"><img src="/pix/happycog.png" title="" alt="Published by Happy Cog" /></a>
336
+ </p>
337
+ </div>
338
+
339
+ </div>
340
+
341
+
342
+
343
+ </div>
344
+
345
+ <div id="footer">
346
+ <p>
347
+ <span class="issn">ISSN: <b>1534-0295</b></span>
348
+ <span class="copyright"><a href="/copyright/" rel="license">Copyright &copy;</a> <span class="years">1998-2009</span> A List Apart Magazine and the authors.</span>
349
+ </p>
350
+ </div>
351
+
352
+ <script src="/d/mint/?js" type="text/javascript"></script>
353
+
354
+
355
+ </body>
356
+ </html>
@@ -0,0 +1,71 @@
1
+ require 'test_helper'
2
+
3
+ class ScraperTest < Test::Unit::TestCase # Covers the Scraper() entry point and direct construction of Scraper::Youtube and Scraper::Article.
4
+ context "given a Youtube URL" do
5
+ setup do
6
+ @url = "http://www.youtube.com/watch?v=dLO2s7SDHJo&feature=rec-HM-r2"
7
+ end
8
+
9
+ should "be able to make a youtube object without failing" do
10
+ assert_nothing_raised do
11
+ Scraper::Youtube.new(:url => @url)
12
+ end
13
+ end
14
+ end
15
+
16
+ context "Scraper( <youtube url > )" do
17
+ setup do
18
+ @url = "http://www.youtube.com/watch?v=dLO2s7SDHJo&feature=rec-HM-r2"
19
+ @scraper = Scraper( :url => @url ) # Scraper() is expected to choose the concrete class from its arguments
20
+ end
21
+
22
+ should "return a Scraper::Youtube object" do
23
+ assert_instance_of Scraper::Youtube, @scraper
24
+ end
25
+ end
26
+
27
+ context "given an article from A-List-Apart" do
28
+ setup do
29
+ @article = fixture_file('unwebbable.html') # fixture_file is the helper that test_helper adds to Test::Unit::TestCase
30
+ end
31
+
32
+ should "be able to make an article object without failing" do
33
+ assert_nothing_raised do
34
+ Scraper::Article.new(:content => @article)
35
+ end
36
+ end
37
+
38
+ context "when extracting the actual content using the URL" do
39
+ setup do
40
+ @url = "http://www.alistapart.com/articles/unwebbable/"
41
+ @scraper1 = Scraper::Article.new(:content => @article) # built from the local fixture contents
42
+ @scraper2 = Scraper::Article.new(:url => @url) # NOTE(review): presumably fetches the live page over the network - confirm
43
+ end
44
+
45
+ should "have the same HTML extracted" do
46
+ assert_equal @scraper1.html, @scraper2.html # both construction paths must yield identical extracted HTML
47
+ end
48
+ end
49
+ end
50
+
51
+ context "Scraper( <alist apart content >)" do
52
+ setup do
53
+ @article = fixture_file('unwebbable.html')
54
+ end
55
+
56
+ should "return an instance of Article" do
57
+ assert_instance_of Scraper::Article, Scraper( :content => @article )
58
+ end
59
+ end
60
+
61
+ context "Scraper( <alist apart url> )" do
62
+ setup do
63
+ @url = "http://www.alistapart.com/articles/unwebbable/"
64
+ end
65
+
66
+ should "return an instance of Article" do
67
+ assert_instance_of Scraper::Article, Scraper( :url => @url )
68
+ end
69
+ end
70
+
71
+ end
@@ -0,0 +1,15 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+
5
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
6
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
7
+ require 'scraper'
8
+
9
+ class Test::Unit::TestCase
10
+ @@fixture_path = File.dirname(__FILE__) + '/fixtures/'
11
+
12
+ def fixture_file( file )
13
+ File.read(@@fixture_path + file)
14
+ end
15
+ end
@@ -0,0 +1,83 @@
1
+ require 'test_helper'
2
+ require 'hpricot'
3
+
4
+ class Scraper::YoutubeTest < Test::Unit::TestCase
5
+ context "given http://www.youtube.com/watch?v=dLO2s7SDHJo&feature=rec-HM-r2" do
6
+ setup do
7
+ @youtube = Scraper::Youtube.new(
8
+ :url => "http://www.youtube.com/watch?v=dLO2s7SDHJo&feature=rec-HM-r2"
9
+ )
10
+ end
11
+
12
+ should "have a video_id dLO2s7SDHJo" do
13
+ assert_equal "dLO2s7SDHJo", @youtube.video_id
14
+ end
15
+ end
16
+
17
+ context "given http://www.youtube.com/watch?feature=rec-HM-r2&v=dLO2s7SDHJo" do
18
+ setup do
19
+ @youtube = Scraper::Youtube.new(
20
+ :url => "http://www.youtube.com/watch?feature=rec-HM-r2&v=dLO2s7SDHJo"
21
+ )
22
+ end
23
+
24
+ should "have a video_id dLO2s7SDHJo" do
25
+ assert_equal "dLO2s7SDHJo", @youtube.video_id
26
+ end
27
+ end
28
+
29
+ context "given http://vimeo.com/5702579" do
30
+ should "raise an ArgumentError" do
31
+ assert_raise ArgumentError do
32
+ Scraper::Youtube.new(:url => "http://vimeo.com/5702579")
33
+ end
34
+ end
35
+ end
36
+
37
+ context "given http://www.youtube.com/watch?feature=rec-HM-r2" do
38
+ should "raise an ArgumentError" do
39
+ assert_raise ArgumentError do
40
+ Scraper::Youtube.new(:url =>
41
+ "http://www.youtube.com/watch?feature=rec-HM-r2"
42
+ )
43
+ end
44
+ end
45
+ end
46
+
47
+ context "HTML given a 1024x760 dimension configuration" do
48
+ setup do
49
+ @youtube = Scraper::Youtube.new(
50
+ :url => "http://www.youtube.com/watch?feature=rec-HM-r2&v=dLO2s7SDHJo"
51
+ )
52
+ @doc = Hpricot(@youtube.html(:width => 1024, :height => 768))
53
+ @embed = @doc.search('embed').first
54
+ @object = @doc.search('object').first
55
+ end
56
+
57
+ should "have an embed tag with 1024 width" do
58
+ assert_equal '1024', @embed.attributes['width']
59
+ end
60
+
61
+ should "have an embed tag with 768 height" do
62
+ assert_equal '768', @embed.attributes['height']
63
+ end
64
+
65
+ should "have an embed tag with the movie's id in its src" do
66
+ assert_match(/dLO2s7SDHJo/, @embed.attributes['src'])
67
+ end
68
+
69
+ should "have an object tag with 1024 width" do
70
+ assert_equal '1024', @object.attributes['width']
71
+ end
72
+
73
+ should "have an object tag with 768 height" do
74
+ assert_equal '768', @object.attributes['height']
75
+ end
76
+
77
+ should "have an object tag with the movie's id in its params" do
78
+ param = @object.search('param').detect { |p| p.attributes['name'] == 'movie' }
79
+
80
+ assert_match(/dLO2s7SDHJo/, param.attributes['value'])
81
+ end
82
+ end
83
+ end
metadata ADDED
@@ -0,0 +1,73 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: cyx-scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ platform: ruby
6
+ authors:
7
+ - Cyril David
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-07-31 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description:
17
+ email: cyx.ucron@gmail.com
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - LICENSE
24
+ - README.markdown
25
+ files:
26
+ - .document
27
+ - .gitignore
28
+ - LICENSE
29
+ - README.markdown
30
+ - Rakefile
31
+ - VERSION
32
+ - lib/scraper.rb
33
+ - lib/scraper/article.rb
34
+ - lib/scraper/youtube.rb
35
+ - scraper.gemspec
36
+ - test/article_test.rb
37
+ - test/fixtures/scraped.html
38
+ - test/fixtures/unwebbable.html
39
+ - test/scraper_test.rb
40
+ - test/test_helper.rb
41
+ - test/youtube_test.rb
42
+ has_rdoc: false
43
+ homepage: http://github.com/cyx/scraper
44
+ licenses:
45
+ post_install_message:
46
+ rdoc_options:
47
+ - --charset=UTF-8
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: "0"
55
+ version:
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: "0"
61
+ version:
62
+ requirements: []
63
+
64
+ rubyforge_project:
65
+ rubygems_version: 1.3.5
66
+ signing_key:
67
+ specification_version: 3
68
+ summary: Generic Ruby library that facilitates scraping content from various websites
69
+ test_files:
70
+ - test/article_test.rb
71
+ - test/scraper_test.rb
72
+ - test/test_helper.rb
73
+ - test/youtube_test.rb