iudex-html 1.0.0-java

Sign up to get free protection for your applications and to get access to all the features.
data/History.rdoc ADDED
@@ -0,0 +1,2 @@
1
+ === 1.0.0 (2011-04-04)
2
+ * Initial release.
data/Manifest.txt ADDED
@@ -0,0 +1,24 @@
1
+ History.rdoc
2
+ Manifest.txt
3
+ README.rdoc
4
+ Rakefile
5
+ pom.xml
6
+ build/HTML.java.erb
7
+ build/attributes
8
+ build/java_generate.rb
9
+ build/tags
10
+ lib/iudex-html/base.rb
11
+ lib/iudex-html.rb
12
+ lib/iudex-html/factory_helper.rb
13
+ test/html_test_helper.rb
14
+ test/setup.rb
15
+ test/test_characters_normalizer.rb
16
+ test/test_extract_filter.rb
17
+ test/test_factory_helper.rb
18
+ test/test_html_parser.rb
19
+ test/test_other_filters.rb
20
+ test/test_other_tree_filters.rb
21
+ test/test_parse_filter.rb
22
+ test/test_tree_walker.rb
23
+ test/test_word_counters.rb
24
+ lib/iudex-html/iudex-html-1.0.0.jar
data/README.rdoc ADDED
@@ -0,0 +1,25 @@
1
+ = iudex-html
2
+
3
+ * http://github.com/dekellum/iudex
4
+
5
+ == Description
6
+
7
+ Iudex is a general purpose web crawler and feed processor in
8
+ ruby/java. The iudex-html gem contains filters for HTML parsing,
9
+ filtering, extracting text and links.
10
+
11
+ == License
12
+
13
+ Copyright (c) 2010-2011 David Kellum
14
+
15
+ Licensed under the Apache License, Version 2.0 (the "License"); you
16
+ may not use this file except in compliance with the License. You
17
+ may obtain a copy of the License at:
18
+
19
+ http://www.apache.org/licenses/LICENSE-2.0
20
+
21
+ Unless required by applicable law or agreed to in writing, software
22
+ distributed under the License is distributed on an "AS IS" BASIS,
23
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24
+ implied. See the License for the specific language governing
25
+ permissions and limitations under the License.
data/Rakefile ADDED
@@ -0,0 +1,53 @@
1
# -*- ruby -*-
#
# Build definition for the iudex-html gem: declares the gem via
# RJack::TarPit, adds release sanity checks on pom.xml/History.rdoc and
# generates HTML.java from the inputs in build/.

$LOAD_PATH << './lib'
require 'iudex-html/base'

require 'rubygems'
gem     'rjack-tarpit', '~> 1.2'
require 'rjack-tarpit'

t = RJack::TarPit.new( 'iudex-html',
                       Iudex::HTML::VERSION,
                       :no_assembly, :java_platform )

t.specify do |h|
  h.developer( "David Kellum", "dek-oss@gravitext.com" )
  h.extra_deps += [ [ 'iudex-core',        '~> 1.0.0' ],
                    [ 'rjack-nekohtml',    '~> 1.9.14' ],
                    [ 'gravitext-xmlprod', '~> 1.4.0' ] ]

  h.testlib = :minitest
  h.extra_dev_deps += [ [ 'minitest',      '>= 1.7.1', '< 2.1' ],
                        [ 'rjack-logback', '~> 1.0' ] ]
end

file 'Manifest.txt' => [ 'pom.xml' ]

# The version is escaped before interpolation so that its dots only
# match literal dots (i.e. "1.0.0" must not match "1x0y0").
task :check_pom_version do
  t.test_line_match( 'pom.xml',
                     /<version>/,
                     /#{ Regexp.escape( t.version.to_s ) }/ )
end
task :check_history_version do
  t.test_line_match( 'History.rdoc',
                     /^==/,
                     / #{ Regexp.escape( t.version.to_s ) } / )
end
task :check_history_date do
  t.test_line_match( 'History.rdoc', /^==/, /\([0-9\-]+\)$/ )
end

task :gem  => [ :check_pom_version, :check_history_version ]
task :tag  => [ :check_pom_version, :check_history_version, :check_history_date ]
task :push => [ :check_history_date ]

file 'target/.tarpit' => [ 'src/main/java/iudex/html/HTML.java' ]

# HTML.java is generated; any change under build/ triggers regeneration.
file 'src/main/java/iudex/html/HTML.java' => FileList.new( "build/*" ) do
  # Explicit relative path: the current directory is not part of
  # $LOAD_PATH on Ruby 1.9.2 and later.
  require './build/java_generate'
  puts "Generating HTML.java"
  JavaGenerator.new.run
end

task :clean do
  rm_f 'src/main/java/iudex/html/HTML.java'
end

t.define_tasks
@@ -0,0 +1,91 @@
1
+ /*
2
+ * Copyright (c) 2010-2011 David Kellum
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ * may not use this file except in compliance with the License. You may
6
+ * obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ * implied. See the License for the specific language governing
14
+ * permissions and limitations under the License.
15
+ */
16
+
17
+ package iudex.html;
18
+
19
+ import java.util.Arrays;
20
+ import java.util.Collections;
21
+ import java.util.List;
22
+ import java.util.HashMap;
23
+ import java.util.Map;
24
+
25
+ import iudex.html.HTMLTag.Flag;
26
+ import static iudex.html.HTMLTag.Flag.*;
27
+
28
+ import com.gravitext.xml.producer.Namespace;
29
+ import com.gravitext.xml.producer.Attribute;
30
+
31
+ /**
32
+ * HTML Tag constants
33
+ * This class is GENERATED by java_generate.rb.
34
+ */
35
+ public class HTML
36
+ {
37
+ public static final Namespace NS_XHTML =
38
+ new Namespace( Namespace.DEFAULT, "http://www.w3.org/1999/xhtml" );
39
+
40
    /**
     * Registry of every tag constant declared below, keyed by tag
     * name. Filled by tag() as each constant is initialized, so it
     * must be declared before the first tag constant.
     */
+ public static final Map<String,HTMLTag> TAGS =
41
+ new HashMap<String,HTMLTag>( 127 );
42
+
43
    /**
     * Registry of every attribute constant declared below, keyed by
     * attribute name. Filled by attr() as each constant is
     * initialized, so it must be declared before the first attribute
     * constant.
     */
+ public static final Map<String,Attribute> ATTRIBUTES =
44
+ new HashMap<String,Attribute>( 59 );
45
+
46
    /** Shared empty list passed for tags without basic attributes. */
+ public static final List<Attribute> EMPTY_ATTS = Collections.emptyList();
47
+ % attributes.each do |a|
48
+ % if a.desc
49
+
50
+ /**
51
+ * Attribute <%= a.name %>: <%= a.desc %>
52
+ */
53
+ % end
54
+ public static final Attribute <%= awidth( 'ATTR_' + const( a.name.upcase ), 5 ) %> = attr( <%= awidth( '"' + a.name + '"', 2 ) %> );
55
+ % end
56
+
57
+ % tags.each do |tag|
58
+ % targs = [ '"' + tag.name + '"' ]
59
+ % basic_atts = tag.basic_atts.map { |a| 'ATTR_' + const( a.name.upcase ) }
60
+ % targs << if basic_atts.empty?
61
+ % "EMPTY_ATTS"
62
+ % else
63
+ % "Arrays.asList( #{ basic_atts.join( ', ' ) } )"
64
+ % end
65
+ % targs += tag.flags
66
+ % if tag.desc
67
+ /**
68
+ * Tag &lt;<%= tag.name %>>: <%= tag.desc %>
69
+ */
70
+ % end
71
+ public static final HTMLTag <%= twidth( tag.name.upcase ) %> =
72
+ tag( <%= targs.join( ', ' ) %> );
73
+
74
+ % end
75
+
76
    /**
     * Creates an HTMLTag in the XHTML namespace with the given basic
     * attributes and flags, registers it in TAGS under its name and
     * returns it.
     */
+ private static HTMLTag tag( String name,
77
+ List<Attribute> basicAtts,
78
+ Flag...flags )
79
+ {
80
+ HTMLTag t = new HTMLTag( name, NS_XHTML, basicAtts, flags );
81
+ TAGS.put( t.name(), t );
82
+ return t;
83
+ }
84
+
85
    /**
     * Creates an Attribute with the given name and the XHTML
     * namespace, registers it in ATTRIBUTES under its name and
     * returns it.
     */
+ private static Attribute attr( String name )
86
+ {
87
+ Attribute a = new Attribute( name, NS_XHTML );
88
+ ATTRIBUTES.put( a.name(), a );
89
+ return a;
90
+ }
91
+ }
data/build/attributes ADDED
@@ -0,0 +1,82 @@
1
+ # HTML Attributes
2
+ #
3
+ # Copyright (c) 2010-2011 David Kellum
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
6
+ # may not use this file except in compliance with the License. You may
7
+ # obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+ #
17
+ # Format:
18
+ # (CSV like) columns: name, tags, description
19
+ # Tags marked with asterisk (*): attribute is for style purposes only.
20
+ #
21
+ # Sources
22
+ # http://www.w3.org/TR/xhtml11/
23
+ # http://www.w3.org/TR/html4/
24
+ # http://www.w3schools.com/tags/ref_standardattributes.asp
25
+ # http://xhtml.com
26
+
27
+ CORE :: ALL except: base head html meta param script style title
28
+ class ,*CORE
29
+ id ,*CORE
30
+ style ,*CORE
31
+ title ,CORE, extra title
32
+
33
+ LANG :: ALL except: base br frame frameset hr iframe param
34
+ dir ,LANG, Text direction; ltr or rtl
35
+ lang ,LANG, language_code; also xml:lang
36
+
37
+ # Meta tag attributes
38
+ http-equiv ,meta, HTTP Header name
39
+ content ,meta, text
40
+ scheme ,meta, format URI
41
+
42
+ # Anchor and link attributes
43
+ charset ,a link, char_encoding of link
44
+ coords ,*a, coordinates; i.e. image map
45
+ hreflang ,link, language_code of referent
46
+ href ,a base link, URL
47
+ media ,link
48
+ name ,a, section_name anchor
49
+ rel ,a link
50
+ rev ,a link
51
+ shape ,*a
52
+ target ,*a *base *link
53
+ type ,link
54
+
55
+ # Image and some frame attributes
56
+ src ,frame img
57
+ alt ,img
58
+ height ,img *tr *th *td *iframe *object
59
+ width ,img *table *tr *th *td *iframe *object
60
+
61
+ # Table specific attributes
62
+ abbr ,tr th
63
+ align ,table tr td th iframe object
64
+ axis ,tr th
65
+ bgcolor ,*table *tr *td *th
66
+ border ,*table
67
+ cellpadding ,*table
68
+ cellspacing ,*table
69
+ char ,tr td th
70
+ charoff ,tr td th
71
+ colspan ,tr td th
72
+ frame ,*table
73
+ headers ,tr td
74
+ nowrap ,*tr *td *th
75
+ rowspan ,tr td th
76
+ rules ,*table
77
+ scope ,tr td th
78
+ summary ,table
79
+ valign ,*tr *td
80
+
81
+ # Purposefully omitted (will be dropped on parse)
82
+ # -- The event attributes on*, onmouse*, onkey*
@@ -0,0 +1,139 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+
4
+ #--
5
+ # Copyright (c) 2010-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You may
9
+ # obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require 'erb'
21
+ require 'ostruct'
22
+
23
# Generator for HTML.java from the tags/attributes input configuration.
#
# Parses the 'tags' and 'attributes' definition files, associates each
# tag with its basic (not style-only) attributes and renders the
# HTML.java.erb template with the result.
class JavaGenerator

  attr_reader :tags, :attributes

  BASEDIR = File.dirname( __FILE__ )

  JAVA_OUT = File.join( BASEDIR, '..', 'src',
                        'main', 'java', 'iudex', 'html', 'HTML.java' )

  # Single letter codes of the tags file which are passed on to the
  # template as flag names. Any other code is ignored.
  FLAGS = {
    'D' => 'DEPRECATED',
    'I' => 'INLINE',
    'M' => 'METADATA',
    'B' => 'BANNED' }

  # basedir:: directory holding the 'tags', 'attributes' and
  #           'HTML.java.erb' inputs (default: directory of this file)
  def initialize( basedir = BASEDIR )
    @basedir = basedir
  end

  # Parse both input files, map attributes to tags and write the
  # generated java source to java_file.
  def run( java_file = JAVA_OUT )
    parse_tags
    parse_attributes
    map_basic_attributes
    generate_java( java_file )
  end

  # Read the tags file into @tags, one OpenStruct (name, flags, desc)
  # per "name, codes, description" line. Comment and blank lines are
  # skipped; any other line raises a RuntimeError.
  # Returns [ @tags ].
  def parse_tags
    @tags = []

    File.open( File.join( @basedir, 'tags' ), 'r' ) do |fin|
      fin.each do |line|
        case line
        when /^\s*#/, /^\s*$/
          # ignore comment, empty lines
        when /^\s*[^\s,]+\s*,[^,]*,[^,]*$/
          # Columns stay positional: an empty codes column must not
          # shift the description into its place.
          name, codes, desc = split_columns( line )
          flags = codes.to_s.split( ' ' ).map { |f| FLAGS[f] }.compact
          @tags << OpenStruct.new( :name  => name,
                                   :flags => flags,
                                   :desc  => desc )
        else
          raise "Parse ERROR: line [#{line}]"
        end
      end
    end

    @tag_max_len = @tags.map { |t| t.name.length }.max || 0
    [ @tags ]
  end

  # Read the attributes file into @attributes, one OpenStruct (name,
  # basic_tags, desc) per "name, tags, description" line. Requires
  # parse_tags to have run, since "SET :: ALL except: ..." lines are
  # expanded against @tags. Raises a RuntimeError on any unrecognized
  # line. Returns [ @attributes ].
  def parse_attributes
    @attributes = []
    tagsets = {}

    File.open( File.join( @basedir, 'attributes' ), 'r' ) do |fin|
      fin.each do |line|
        case line
        when /^\s*#/, /^\s*$/
          # ignore comment, empty lines
        when /^\s*([A-Z]+)\s*::\s*ALL\s+except:(.*)$/
          sname = $1
          except = $2.split( ' ' )
          names = @tags.map { |t| t.name }
          tagsets[sname] = names.reject { |n| except.include?( n ) }
        when /^\s*[^\s,]+\s*,/
          name, tag_col, desc = split_columns( line )
          # FIXME: Handle attributes, desc.

          # Entries marked with an asterisk (style only) are not basic.
          btags = tag_col.to_s.split( ' ' ).reject { |t| t =~ /^\*/ }
          # Expand named tag sets (i.e. CORE) to their member tags.
          btags = btags.map { |t| tagsets[ t ] || t }.flatten

          @attributes << OpenStruct.new( :name       => name,
                                         :basic_tags => btags,
                                         :desc       => desc )
        else
          raise "Parse ERROR: line [#{line}]"
        end
      end
    end

    @attr_max_len = @attributes.map { |t| t.name.length }.max || 0
    [ @attributes ]
  end

  # Set basic_atts on each tag to the attributes listing that tag
  # among their basic_tags. Requires parse_tags and parse_attributes.
  def map_basic_attributes
    @tags.each do |tag|
      tag.basic_atts =
        @attributes.select { |attr| attr.basic_tags.include?( tag.name ) }
    end
  end

  # Right pad val with spaces to the longest tag name, plus extra.
  def twidth( val, extra = 0 )
    val.ljust( @tag_max_len + extra )
  end

  # Right pad val with spaces to the longest attribute name, plus extra.
  def awidth( val, extra = 0 )
    val.ljust( @attr_max_len + extra )
  end

  # Replace dashes, which are not valid in java identifiers.
  def const( val )
    val.tr( '-', '_' )
  end

  # Render HTML.java.erb in the context of this generator and write
  # the result to java_file.
  def generate_java( java_file )
    erb_file = File.join( @basedir, 'HTML.java.erb' )

    # Render before opening the output, so that a template failure
    # does not leave a truncated java_file behind.
    java = new_template( IO.read( erb_file ) ).result( binding )

    File.open( java_file, 'w' ) do |fout|
      fout << java
    end
  end

  private

  # Split a line into at most 3 stripped columns (name, tags or codes,
  # description). Empty columns are returned as nil and keep their
  # position.
  def split_columns( line )
    line.split( ',', 3 ).map do |col|
      col = col.strip
      col.empty? ? nil : col
    end
  end

  # ERB template with '%' line directives enabled. The positional form
  # is tried first for older rubies; the keyword form is the fallback
  # where ERB.new rejects the extra positional arguments.
  def new_template( source )
    ERB.new( source, nil, '%' )
  rescue ArgumentError
    ERB.new( source, :trim_mode => '%' )
  end

end
136
+
137
# Run the generator when this file is executed directly; command line
# arguments are passed through to JavaGenerator#run.
JavaGenerator.new.run( *ARGV ) if __FILE__ == $0
data/build/tags ADDED
@@ -0,0 +1,130 @@
1
+ # HTML Tags
2
+ #
3
+ # Copyright (c) 2010-2011 David Kellum
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
6
+ # may not use this file except in compliance with the License. You may
7
+ # obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+ #
17
+ # Sources
18
+ # http://www.w3.org/TR/xhtml11/
19
+ # http://www.w3.org/TR/html4/
20
+ # http://www.w3schools.com/tags/default.asp
21
+ # http://xhtml.com/
22
+ #
23
+ # Codes:
24
+ # E :: Empty Tag
25
+ # S :: In Strict HTML 4.01/XHTML 1.0
26
+ # T :: In Transitional HTML 4.01/XHTML 1.0
27
+ # F :: In frameset annex
28
+ # D :: Deprecated
29
+ # I :: Inline elements (Note <br/> is not labeled inline.)
30
+ # M :: Metadata elements (content not visible text), i.e. head
31
+ # B :: Banned/blacklisted elements from which text should not be extracted.
32
+
33
+ a , S T F I , anchor
34
+ abbr , S T F I , abbreviation
35
+ acronym , S T F I , acronym
36
+ address , S T F , contact information for the author or owner
37
+ applet , T F D , embedded applet
38
+ area ,E S T F , area inside an image-map
39
+ b , S T F I , bold text
40
+ base ,E S T F M , default address or a default target for all links on a page
41
+ basefont ,E T F D I M , default font; color; or size for the text in a page
42
+ bdo , S T F I , the text direction
43
+ big , S T F I , big text
44
+ blockquote , S T F , long quotation
45
+ body , S T F , the document's body
46
+ br ,E S T F , single line break
47
+ button , S T F I B, push button
48
+ caption , S T F , table caption
49
+ center , T F D , centered text
50
+ cite , S T F I , citation
51
+ code , S T F I , computer code text
52
+ col ,E S T F , attribute values for one or more columns in a table
53
+ colgroup , S T F , group of columns in a table for formatting
54
+ dd , S T F , description of a term in a definition list
55
+ del , S T F I , deleted text
56
+ dfn , S T F I , definition term
57
+ dir , T F D , directory list
58
+ div , S T F , section in a document
59
+ dl , S T F , definition list
60
+ dt , S T F , term (an item) in a definition list
61
+ em , S T F I , emphasized text
62
+ fieldset , S T F B, border around elements in a form
63
+ font , T F D I , font; color; or size for text
64
+ form , S T F , form for user input
65
+ frame ,E F B, window (a frame) in a frameset
66
+ frameset , F B, set of frames
67
+ h1 , S T F , heading level 1
68
+ h2 , S T F , heading level 2
69
+ h3 , S T F , heading level 3
70
+ h4 , S T F , heading level 4
71
+ h5 , S T F , heading level 5
72
+ h6 , S T F , heading level 6
73
+ head , S T F M , information about the document
74
+ hr ,E S T F , horizontal line
75
+ html , S T F , document
76
+ i , S T F I , italic text
77
+ iframe , T F , inline frame
78
+ img ,E S T F I , image
79
+ input ,E S T F I B, input control
80
+ ins , S T F I , inserted text
81
+ isindex , T F D , searchable index related to a document
82
+ kbd , S T F I , keyboard text
83
+ label , S T F I B, label for an input element
84
+ legend , S T F B, caption for a fieldset element
85
+ li , S T F , list item
86
+ link ,E S T F M , the relationship between a document and an external resource
87
+ map , S T F I , image-map
88
+ menu , T F D , menu list
89
+ meta ,E S T F M , metadata
90
+ noframes , T F B, alternate content where frames not supported
91
+ noscript , S T F B, alternate content where script not supported
92
+ object , S T F I B, embedded object
93
+ ol , S T F , ordered list
94
+ optgroup , S T F B, group of related options in a select list
95
+ option , S T F B, option in a select list
96
+ p , S T F , paragraph
97
+ param ,E S T F , parameter for an object
98
+ pre , S T F , preformatted text
99
+ q , S T F I , short quotation
100
+ rb , , ruby base text
101
+ rbc , , ruby base container (complex)
102
+ rp , , ruby simple text container
103
+ rt , , ruby annotation text
104
+ rtc , , ruby text container (complex)
105
+ ruby , I , ruby pronunciation aid
106
+ s , T F D I , strikethrough text
107
+ samp , S T F I , sample computer code
108
+ script , S T F I B, client-side script
109
+ select , S T F I B, select list (drop-down list)
110
+ small , S T F I , small text
111
+ span , S T F I , section in a document
112
+ strike , T F D I , strikethrough text
113
+ strong , S T F I , strong text
114
+ style , S T F B, style information for a document
115
+ sub , S T F I , subscripted text
116
+ sup , S T F I , superscripted text
117
+ table , S T F , table
118
+ tbody , S T F , Groups the body content in a table
119
+ td , S T F , cell in a table
120
+ textarea , S T F I B, multi-line text input control
121
+ tfoot , S T F , Groups the footer content in a table
122
+ th , S T F , header cell in a table
123
+ thead , S T F , Groups the header content in a table
124
+ title , S T F M , the title of a document
125
+ tr , S T F , row in a table
126
+ tt , S T F I , teletype text
127
+ u , T F D I , underlined text
128
+ ul , S T F , unordered list
129
+ var , S T F I , variable part of a text
130
+ xmp , D , preformatted text