iudex-html 1.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.rdoc ADDED
@@ -0,0 +1,2 @@
1
+ === 1.0.0 (2011-04-04)
2
+ * Initial release.
data/Manifest.txt ADDED
@@ -0,0 +1,24 @@
1
+ History.rdoc
2
+ Manifest.txt
3
+ README.rdoc
4
+ Rakefile
5
+ pom.xml
6
+ build/HTML.java.erb
7
+ build/attributes
8
+ build/java_generate.rb
9
+ build/tags
10
+ lib/iudex-html/base.rb
11
+ lib/iudex-html.rb
12
+ lib/iudex-html/factory_helper.rb
13
+ test/html_test_helper.rb
14
+ test/setup.rb
15
+ test/test_characters_normalizer.rb
16
+ test/test_extract_filter.rb
17
+ test/test_factory_helper.rb
18
+ test/test_html_parser.rb
19
+ test/test_other_filters.rb
20
+ test/test_other_tree_filters.rb
21
+ test/test_parse_filter.rb
22
+ test/test_tree_walker.rb
23
+ test/test_word_counters.rb
24
+ lib/iudex-html/iudex-html-1.0.0.jar
data/README.rdoc ADDED
@@ -0,0 +1,25 @@
1
+ = iudex-html
2
+
3
+ * http://github.com/dekellum/iudex
4
+
5
+ == Description
6
+
7
+ Iudex is a general purpose web crawler and feed processor in
8
+ ruby/java. The iudex-html gem contains filters for HTML parsing,
9
+ filtering, extracting text and links.
10
+
11
+ == License
12
+
13
+ Copyright (c) 2010-2011 David Kellum
14
+
15
+ Licensed under the Apache License, Version 2.0 (the "License"); you
16
+ may not use this file except in compliance with the License. You
17
+ may obtain a copy of the License at:
18
+
19
+ http://www.apache.org/licenses/LICENSE-2.0
20
+
21
+ Unless required by applicable law or agreed to in writing, software
22
+ distributed under the License is distributed on an "AS IS" BASIS,
23
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24
+ implied. See the License for the specific language governing
25
+ permissions and limitations under the License.
data/Rakefile ADDED
@@ -0,0 +1,53 @@
1
+ # -*- ruby -*-
2
+
3
+ $LOAD_PATH << './lib'
4
+ require 'iudex-html/base'
5
+
6
+ require 'rubygems'
7
+ gem 'rjack-tarpit', '~> 1.2'
8
+ require 'rjack-tarpit'
9
+
10
+ t = RJack::TarPit.new( 'iudex-html',
11
+ Iudex::HTML::VERSION,
12
+ :no_assembly, :java_platform )
13
+
14
+ t.specify do |h|
15
+ h.developer( "David Kellum", "dek-oss@gravitext.com" )
16
+ h.extra_deps += [ [ 'iudex-core', '~> 1.0.0' ],
17
+ [ 'rjack-nekohtml', '~> 1.9.14' ],
18
+ [ 'gravitext-xmlprod', '~> 1.4.0' ] ]
19
+
20
+ h.testlib = :minitest
21
+ h.extra_dev_deps += [ [ 'minitest', '>= 1.7.1', '< 2.1' ],
22
+ [ 'rjack-logback', '~> 1.0' ] ]
23
+ end
24
+
25
+ file 'Manifest.txt' => [ 'pom.xml' ]
26
+
27
+ task :check_pom_version do
28
+ t.test_line_match( 'pom.xml', /<version>/, /#{t.version}/ )
29
+ end
30
+ task :check_history_version do
31
+ t.test_line_match( 'History.rdoc', /^==/, / #{t.version} / )
32
+ end
33
+ task :check_history_date do
34
+ t.test_line_match( 'History.rdoc', /^==/, /\([0-9\-]+\)$/ )
35
+ end
36
+
37
+ task :gem => [ :check_pom_version, :check_history_version ]
38
+ task :tag => [ :check_pom_version, :check_history_version, :check_history_date ]
39
+ task :push => [ :check_history_date ]
40
+
41
+ file 'target/.tarpit' => [ 'src/main/java/iudex/html/HTML.java' ]
42
+
43
+ file 'src/main/java/iudex/html/HTML.java' => FileList.new( "build/*" ) do
44
+ require 'build/java_generate'
45
+ puts "Generating HTML.java"
46
+ JavaGenerator.new.run
47
+ end
48
+
49
+ task :clean do
50
+ rm_f 'src/main/java/iudex/html/HTML.java'
51
+ end
52
+
53
+ t.define_tasks
@@ -0,0 +1,91 @@
1
+ /*
2
+ * Copyright (c) 2010-2011 David Kellum
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ * may not use this file except in compliance with the License. You may
6
+ * obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ * implied. See the License for the specific language governing
14
+ * permissions and limitations under the License.
15
+ */
16
+
17
+ package iudex.html;
18
+
19
+ import java.util.Arrays;
20
+ import java.util.Collections;
21
+ import java.util.List;
22
+ import java.util.HashMap;
23
+ import java.util.Map;
24
+
25
+ import iudex.html.HTMLTag.Flag;
26
+ import static iudex.html.HTMLTag.Flag.*;
27
+
28
+ import com.gravitext.xml.producer.Namespace;
29
+ import com.gravitext.xml.producer.Attribute;
30
+
31
+ /**
32
+ * HTML Tag constants
33
+ * This class is GENERATED by java_generate.rb.
34
+ */
35
+ public class HTML
36
+ {
37
+ public static final Namespace NS_XHTML =
38
+ new Namespace( Namespace.DEFAULT, "http://www.w3.org/1999/xhtml" );
39
+
40
+ public static final Map<String,HTMLTag> TAGS =
41
+ new HashMap<String,HTMLTag>( 127 );
42
+
43
+ public static final Map<String,Attribute> ATTRIBUTES =
44
+ new HashMap<String,Attribute>( 59 );
45
+
46
+ public static final List<Attribute> EMPTY_ATTS = Collections.emptyList();
47
+ % attributes.each do |a|
48
+ % if a.desc
49
+
50
+ /**
51
+ * Attribute <%= a.name %>: <%= a.desc %>
52
+ */
53
+ % end
54
+ public static final Attribute <%= awidth( 'ATTR_' + const( a.name.upcase ), 5 ) %> = attr( <%= awidth( '"' + a.name + '"', 2 ) %> );
55
+ % end
56
+
57
+ % tags.each do |tag|
58
+ % targs = [ '"' + tag.name + '"' ]
59
+ % basic_atts = tag.basic_atts.map { |a| 'ATTR_' + const( a.name.upcase ) }
60
+ % targs << if basic_atts.empty?
61
+ % "EMPTY_ATTS"
62
+ % else
63
+ % "Arrays.asList( #{ basic_atts.join( ', ' ) } )"
64
+ % end
65
+ % targs += tag.flags
66
+ % if tag.desc
67
+ /**
68
+ * Tag &lt;<%= tag.name %>>: <%= tag.desc %>
69
+ */
70
+ % end
71
+ public static final HTMLTag <%= twidth( tag.name.upcase ) %> =
72
+ tag( <%= targs.join( ', ' ) %> );
73
+
74
+ % end
75
+
76
+ private static HTMLTag tag( String name,
77
+ List<Attribute> basicAtts,
78
+ Flag...flags )
79
+ {
80
+ HTMLTag t = new HTMLTag( name, NS_XHTML, basicAtts, flags );
81
+ TAGS.put( t.name(), t );
82
+ return t;
83
+ }
84
+
85
+ private static Attribute attr( String name )
86
+ {
87
+ Attribute a = new Attribute( name, NS_XHTML );
88
+ ATTRIBUTES.put( a.name(), a );
89
+ return a;
90
+ }
91
+ }
data/build/attributes ADDED
@@ -0,0 +1,82 @@
1
+ # HTML Attributes
2
+ #
3
+ # Copyright (c) 2010-2011 David Kellum
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
6
+ # may not use this file except in compliance with the License. You may
7
+ # obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+ #
17
+ # Format:
18
+ # (CSV like) columns: name, tags, description
19
+ # Tags marked with asterisk (*): attribute is for style purposes only.
20
+ #
21
+ # Sources
22
+ # http://www.w3.org/TR/xhtml11/
23
+ # http://www.w3.org/TR/html4/
24
+ # http://www.w3schools.com/tags/ref_standardattributes.asp
25
+ # http://xhtml.com
26
+
27
+ CORE :: ALL except: base head html meta param script style title
28
+ class ,*CORE
29
+ id ,*CORE
30
+ style ,*CORE
31
+ title ,CORE, extra title
32
+
33
+ LANG :: ALL except: base br frame frameset hr iframe param
34
+ dir ,LANG, Text direction; ltr or rtl
35
+ lang ,LANG, language_code; also xml:lang
36
+
37
+ # Meta tag attributes
38
+ http-equiv ,meta, HTTP Header name
39
+ content ,meta, text
40
+ scheme ,meta, format URI
41
+
42
+ # Anchor and link attributes
43
+ charset ,a link, char_encoding of link
44
+ coords ,*a, coordinates; i.e. image map
45
+ hreflang ,link, language_code of referent
46
+ href ,a base link, URL
47
+ media ,link
48
+ name ,a, section_name anchor
49
+ rel ,a link
50
+ rev ,a link
51
+ shape ,*a
52
+ target ,*a *base *link
53
+ type ,link
54
+
55
+ # Image and some frame attributes
56
+ src ,frame img
57
+ alt ,img
58
+ height ,img *tr *th *td *iframe *object
59
+ width ,img *table *tr *th *td *iframe *object
60
+
61
+ # Table specific attributes
62
+ abbr ,tr th
63
+ align ,table tr td th iframe object
64
+ axis ,tr th
65
+ bgcolor ,*table *tr *td *th
66
+ border ,*table
67
+ cellpadding ,*table
68
+ cellspacing ,*table
69
+ char ,tr td th
70
+ charoff ,tr td th
71
+ colspan ,tr td th
72
+ frame ,*table
73
+ headers ,tr td
74
+ nowrap ,*tr *td *th
75
+ rowspan ,tr td th
76
+ rules ,*table
77
+ scope ,tr td th
78
+ summary ,table
79
+ valign ,*tr *td
80
+
81
+ # Purposefully omitted (will be dropped on parse)
82
+ # -- The event attributes on*, onmouse*, onkey*
@@ -0,0 +1,139 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+
4
+ #--
5
+ # Copyright (c) 2010-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You may
9
+ # obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require 'erb'
21
+ require 'ostruct'
22
+
23
+ # Generates HTML.java from the tags and attributes input configuration files
24
+ class JavaGenerator
25
+
26
+ attr_reader :tags, :attributes
27
+
28
+ BASEDIR = File.dirname( __FILE__ )
29
+
30
+ JAVA_OUT = File.join( BASEDIR, '..', 'src',
31
+ 'main', 'java', 'iudex', 'html', 'HTML.java' )
32
+
33
+ def run( java_file = JAVA_OUT )
34
+ parse_tags
35
+ parse_attributes
36
+ map_basic_attributes
37
+ generate_java( java_file )
38
+ end
39
+
40
+ FLAGS = {
41
+ 'D' => 'DEPRECATED',
42
+ 'I' => 'INLINE',
43
+ 'M' => 'METADATA',
44
+ 'B' => 'BANNED' }
45
+
46
+ def parse_tags()
47
+ @tags = []
48
+
49
+ open( File.join( BASEDIR, 'tags' ), 'r' ) do |fin|
50
+ fin.each do |line|
51
+ case line
52
+ when /^\s*#/, /^\s*$/
53
+ # ignore comment, empty lines
54
+ when /^\s*[^\s,]+\s*,[^,]*,[^,]*$/
55
+ r = line.split(',').map { |c| c.strip }
56
+ r = r.compact.reject { |c| c.empty? }
57
+ flags = r[1].split(' ').map { |f| FLAGS[f] }.compact
58
+ @tags << OpenStruct.new( :name => r[0],
59
+ :flags => flags,
60
+ :desc => r[2] )
61
+ else
62
+ raise "Parse ERROR: line [#{line}]"
63
+ end
64
+ end
65
+ end
66
+
67
+ @tag_max_len = @tags.map { |t| t.name.length }.max
68
+ [ @tags ]
69
+ end
70
+
71
+ def parse_attributes()
72
+ @attributes = []
73
+ tagsets = {}
74
+
75
+ open( File.join( BASEDIR, 'attributes' ), 'r' ) do |fin|
76
+ fin.each do |line|
77
+ case line
78
+ when /^\s*#/, /^\s*$/
79
+ # ignore comment, empty lines
80
+ when /^\s*([A-Z]+)\s*::\s*ALL\s+except:(.*)$/
81
+ sname = $1
82
+ except = $2.split( ' ' ).compact.reject { |t| t.empty? }
83
+ tset = @tags.reject { |t| except.include?( t.name ) }
84
+ tset.map! { |t| t.name }
85
+ tagsets[sname] = tset
86
+ when /^\s*[^\s,]+\s*,/
87
+ r = line.split(',').map { |c| c.strip }
88
+ r = r.compact.reject { |c| c.empty? }
89
+ # FIXME: Handle attributes, desc.
90
+
91
+ btags = r[1].split(' ').compact.reject { |t| t.empty? || t =~ /^\*/ }
92
+ btags = btags.map { |t| tagsets[ t ] || t }.flatten
93
+
94
+ @attributes << OpenStruct.new( :name => r[0],
95
+ :basic_tags => btags,
96
+ :desc => r[2] )
97
+ else
98
+ raise "Parse ERROR: line [#{line}]"
99
+ end
100
+ end
101
+ end
102
+
103
+ def map_basic_attributes()
104
+ @tags.each do |tag|
105
+ tag.basic_atts =
106
+ @attributes.select { |attr| attr.basic_tags.include?( tag.name ) }
107
+ end
108
+ end
109
+
110
+ @attr_max_len = @attributes.map { |t| t.name.length }.max
111
+ [ @attributes ]
112
+ end
113
+
114
+ def twidth( val, extra = 0 )
115
+ val + ( ' ' * ( @tag_max_len - val.length + extra ) )
116
+ end
117
+
118
+ def awidth( val, extra = 0 )
119
+ val + ( ' ' * ( @attr_max_len - val.length + extra ) )
120
+ end
121
+
122
+ def const( val )
123
+ val.gsub( /\-/, '_' )
124
+ end
125
+
126
+ def generate_java( java_file )
127
+ erb_file = File.join( BASEDIR, 'HTML.java.erb' )
128
+ template = ERB.new( IO.read( erb_file ), nil, '%' )
129
+
130
+ open( java_file, 'w' ) do |fout|
131
+ fout << template.result( binding )
132
+ end
133
+ end
134
+
135
+ end
136
+
137
+ if $0 == __FILE__
138
+ JavaGenerator.new.run( *ARGV )
139
+ end
data/build/tags ADDED
@@ -0,0 +1,130 @@
1
+ # HTML Tags
2
+ #
3
+ # Copyright (c) 2010-2011 David Kellum
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
6
+ # may not use this file except in compliance with the License. You may
7
+ # obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+ #
17
+ # Sources
18
+ # http://www.w3.org/TR/xhtml11/
19
+ # http://www.w3.org/TR/html4/
20
+ # http://www.w3schools.com/tags/default.asp
21
+ # http://xhtml.com/
22
+ #
23
+ # Codes:
24
+ # E :: Empty Tag
25
+ # S :: In Strict HTML 4.01/XHTML 1.0
26
+ # T :: In Transitional HTML 4.01/XHTML 1.0
27
+ # F :: In frameset annex
28
+ # D :: Deprecated
29
+ # I :: Inline elements (Note <br/> is not labeled inline.)
30
+ # M :: Metadata elements (content not visible text), i.e. head
31
+ # B :: Banned/blacklisted elements from which text should not be extracted.
32
+
33
+ a , S T F I , anchor
34
+ abbr , S T F I , abbreviation
35
+ acronym , S T F I , acronym
36
+ address , S T F , contact information for the author or owner
37
+ applet , T F D , embedded applet
38
+ area ,E S T F , area inside an image-map
39
+ b , S T F I , bold text
40
+ base ,E S T F M , default address or a default target for all links on a page
41
+ basefont ,E T F D I M , default font; color; or size for the text in a page
42
+ bdo , S T F I , the text direction
43
+ big , S T F I , big text
44
+ blockquote , S T F , long quotation
45
+ body , S T F , the document's body
46
+ br ,E S T F , single line break
47
+ button , S T F I B, push button
48
+ caption , S T F , table caption
49
+ center , T F D , centered text
50
+ cite , S T F I , citation
51
+ code , S T F I , computer code text
52
+ col ,E S T F , attribute values for one or more columns in a table
53
+ colgroup , S T F , group of columns in a table for formatting
54
+ dd , S T F , description of a term in a definition list
55
+ del , S T F I , deleted text
56
+ dfn , S T F I , definition term
57
+ dir , T F D , directory list
58
+ div , S T F , section in a document
59
+ dl , S T F , definition list
60
+ dt , S T F , term (an item) in a definition list
61
+ em , S T F I , emphasized text
62
+ fieldset , S T F B, border around elements in a form
63
+ font , T F D I , font; color; or size for text
64
+ form , S T F , form for user input
65
+ frame ,E F B, window (a frame) in a frameset
66
+ frameset , F B, set of frames
67
+ h1 , S T F , heading level 1
68
+ h2 , S T F , heading level 2
69
+ h3 , S T F , heading level 3
70
+ h4 , S T F , heading level 4
71
+ h5 , S T F , heading level 5
72
+ h6 , S T F , heading level 6
73
+ head , S T F M , information about the document
74
+ hr ,E S T F , horizontal line
75
+ html , S T F , document
76
+ i , S T F I , italic text
77
+ iframe , T F , inline frame
78
+ img ,E S T F I , image
79
+ input ,E S T F I B, input control
80
+ ins , S T F I , inserted text
81
+ isindex , T F D , searchable index related to a document
82
+ kbd , S T F I , keyboard text
83
+ label , S T F I B, label for an input element
84
+ legend , S T F B, caption for a fieldset element
85
+ li , S T F , list item
86
+ link ,E S T F M , the relationship between a document and an external resource
87
+ map , S T F I , image-map
88
+ menu , T F D , menu list
89
+ meta ,E S T F M , metadata
90
+ noframes , T F B, alternate content where frames not supported
91
+ noscript , S T F B, alternate content where script not supported
92
+ object , S T F I B, embedded object
93
+ ol , S T F , ordered list
94
+ optgroup , S T F B, group of related options in a select list
95
+ option , S T F B, option in a select list
96
+ p , S T F , paragraph
97
+ param ,E S T F , parameter for an object
98
+ pre , S T F , preformatted text
99
+ q , S T F I , short quotation
100
+ rb , , ruby base text
101
+ rbc , , ruby base container (complex)
102
+ rp , , ruby simple text container
103
+ rt , , ruby annotation text
104
+ rtc , , ruby text container (complex)
105
+ ruby , I , ruby pronunciation aid
106
+ s , T F D I , strikethrough text
107
+ samp , S T F I , sample computer code
108
+ script , S T F I B, client-side script
109
+ select , S T F I B, select list (drop-down list)
110
+ small , S T F I , small text
111
+ span , S T F I , section in a document
112
+ strike , T F D I , strikethrough text
113
+ strong , S T F I , strong text
114
+ style , S T F B, style information for a document
115
+ sub , S T F I , subscripted text
116
+ sup , S T F I , superscripted text
117
+ table , S T F , table
118
+ tbody , S T F , Groups the body content in a table
119
+ td , S T F , cell in a table
120
+ textarea , S T F I B, multi-line text input control
121
+ tfoot , S T F , Groups the footer content in a table
122
+ th , S T F , header cell in a table
123
+ thead , S T F , Groups the header content in a table
124
+ title , S T F M , the title of a document
125
+ tr , S T F , row in a table
126
+ tt , S T F I , teletype text
127
+ u , T F D I , underlined text
128
+ ul , S T F , unordered list
129
+ var , S T F I , variable part of a text
130
+ xmp , D , preformatted text